litagin committed
Commit b6b44c3 · verified · 1 Parent(s): dd1b89b

Delete text

text/__init__.py DELETED
@@ -1,30 +0,0 @@
- from text.symbols import *
-
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
-
-
- def cleaned_text_to_sequence(cleaned_text, tones, language):
-     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-       text: string to convert to a sequence
-     Returns:
-       List of integers corresponding to the symbols in the text
-     """
-     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
-     tone_start = language_tone_start_map[language]
-     tones = [i + tone_start for i in tones]
-     lang_id = language_id_map[language]
-     lang_ids = [lang_id for i in phones]
-     return phones, tones, lang_ids
-
-
- def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
-     from .chinese_bert import get_bert_feature as zh_bert
-     from .english_bert_mock import get_bert_feature as en_bert
-     from .japanese_bert import get_bert_feature as jp_bert
-
-     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
-     bert = lang_bert_func_map[language](
-         norm_text, word2ph, device, style_text, style_weight
-     )
-     return bert
 
text/chinese.py DELETED
@@ -1,199 +0,0 @@
- import os
- import re
-
- import cn2an
- from pypinyin import lazy_pinyin, Style
-
- from text.symbols import punctuation
- from text.tone_sandhi import ToneSandhi
-
- current_file_path = os.path.dirname(__file__)
- pinyin_to_symbol_map = {
-     line.split("\t")[0]: line.strip().split("\t")[1]
-     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
- }
-
- import jieba.posseg as psg
-
-
- rep_map = {
-     ":": ",",
-     ";": ",",
-     ",": ",",
-     "。": ".",
-     "!": "!",
-     "?": "?",
-     "\n": ".",
-     "·": ",",
-     "、": ",",
-     "...": "…",
-     "$": ".",
-     "“": "'",
-     "”": "'",
-     '"': "'",
-     "‘": "'",
-     "’": "'",
-     "(": "'",
-     ")": "'",
-     "(": "'",
-     ")": "'",
-     "《": "'",
-     "》": "'",
-     "【": "'",
-     "】": "'",
-     "[": "'",
-     "]": "'",
-     "—": "-",
-     "~": "-",
-     "~": "-",
-     "「": "'",
-     "」": "'",
- }
-
- tone_modifier = ToneSandhi()
-
-
- def replace_punctuation(text):
-     text = text.replace("嗯", "恩").replace("呣", "母")
-     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
-
-     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
-
-     replaced_text = re.sub(
-         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
-     )
-
-     return replaced_text
-
-
- def g2p(text):
-     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
-     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
-     phones, tones, word2ph = _g2p(sentences)
-     assert sum(word2ph) == len(phones)
-     assert len(word2ph) == len(text)  # Sometimes it will crash, you can add a try-catch.
-     phones = ["_"] + phones + ["_"]
-     tones = [0] + tones + [0]
-     word2ph = [1] + word2ph + [1]
-     return phones, tones, word2ph
-
-
- def _get_initials_finals(word):
-     initials = []
-     finals = []
-     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
-     orig_finals = lazy_pinyin(
-         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
-     )
-     for c, v in zip(orig_initials, orig_finals):
-         initials.append(c)
-         finals.append(v)
-     return initials, finals
-
-
- def _g2p(segments):
-     phones_list = []
-     tones_list = []
-     word2ph = []
-     for seg in segments:
-         # Replace all English words in the sentence
-         seg = re.sub("[a-zA-Z]+", "", seg)
-         seg_cut = psg.lcut(seg)
-         initials = []
-         finals = []
-         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
-         for word, pos in seg_cut:
-             if pos == "eng":
-                 continue
-             sub_initials, sub_finals = _get_initials_finals(word)
-             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
-             initials.append(sub_initials)
-             finals.append(sub_finals)
-
-             # assert len(sub_initials) == len(sub_finals) == len(word)
-         initials = sum(initials, [])
-         finals = sum(finals, [])
-         #
-         for c, v in zip(initials, finals):
-             raw_pinyin = c + v
-             # NOTE: post process for pypinyin outputs
-             # we discriminate i, ii and iii
-             if c == v:
-                 assert c in punctuation
-                 phone = [c]
-                 tone = "0"
-                 word2ph.append(1)
-             else:
-                 v_without_tone = v[:-1]
-                 tone = v[-1]
-
-                 pinyin = c + v_without_tone
-                 assert tone in "12345"
-
-                 if c:
-                     # multi-syllable
-                     v_rep_map = {
-                         "uei": "ui",
-                         "iou": "iu",
-                         "uen": "un",
-                     }
-                     if v_without_tone in v_rep_map.keys():
-                         pinyin = c + v_rep_map[v_without_tone]
-                 else:
-                     # single-syllable
-                     pinyin_rep_map = {
-                         "ing": "ying",
-                         "i": "yi",
-                         "in": "yin",
-                         "u": "wu",
-                     }
-                     if pinyin in pinyin_rep_map.keys():
-                         pinyin = pinyin_rep_map[pinyin]
-                     else:
-                         single_rep_map = {
-                             "v": "yu",
-                             "e": "e",
-                             "i": "y",
-                             "u": "w",
-                         }
-                         if pinyin[0] in single_rep_map.keys():
-                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
-
-                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
-                 phone = pinyin_to_symbol_map[pinyin].split(" ")
-                 word2ph.append(len(phone))
-
-             phones_list += phone
-             tones_list += [int(tone)] * len(phone)
-     return phones_list, tones_list, word2ph
-
-
- def text_normalize(text):
-     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
-     for number in numbers:
-         text = text.replace(number, cn2an.an2cn(number), 1)
-     text = replace_punctuation(text)
-     return text
-
-
- def get_bert_feature(text, word2ph):
-     from text import chinese_bert
-
-     return chinese_bert.get_bert_feature(text, word2ph)
-
-
- if __name__ == "__main__":
-     from text.chinese_bert import get_bert_feature
-
-     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
-     text = text_normalize(text)
-     print(text)
-     phones, tones, word2ph = g2p(text)
-     bert = get_bert_feature(text, word2ph)
-
-     print(phones, tones, word2ph, bert.shape)
-
-
- # # Example usage
- # text = "这是一个示例文本:,你好!这是一个测试...."
- # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
 
text/chinese_bert.py DELETED
@@ -1,119 +0,0 @@
- import sys
-
- import torch
- from transformers import AutoModelForMaskedLM, AutoTokenizer
-
- from config import config
-
- LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
-
- tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
-
- models = dict()
-
-
- def get_bert_feature(
-     text,
-     word2ph,
-     device=config.bert_gen_config.device,
-     style_text=None,
-     style_weight=0.7,
- ):
-     if (
-         sys.platform == "darwin"
-         and torch.backends.mps.is_available()
-         and device == "cpu"
-     ):
-         device = "mps"
-     if not device:
-         device = "cuda"
-     if device not in models.keys():
-         models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
-     with torch.no_grad():
-         inputs = tokenizer(text, return_tensors="pt")
-         for i in inputs:
-             inputs[i] = inputs[i].to(device)
-         res = models[device](**inputs, output_hidden_states=True)
-         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
-         if style_text:
-             style_inputs = tokenizer(style_text, return_tensors="pt")
-             for i in style_inputs:
-                 style_inputs[i] = style_inputs[i].to(device)
-             style_res = models[device](**style_inputs, output_hidden_states=True)
-             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
-             style_res_mean = style_res.mean(0)
-     assert len(word2ph) == len(text) + 2
-     word2phone = word2ph
-     phone_level_feature = []
-     for i in range(len(word2phone)):
-         if style_text:
-             repeat_feature = (
-                 res[i].repeat(word2phone[i], 1) * (1 - style_weight)
-                 + style_res_mean.repeat(word2phone[i], 1) * style_weight
-             )
-         else:
-             repeat_feature = res[i].repeat(word2phone[i], 1)
-         phone_level_feature.append(repeat_feature)
-
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-
-     return phone_level_feature.T
-
-
- if __name__ == "__main__":
-     word_level_feature = torch.rand(38, 1024)  # 12 words, each with 1024-dim features
-     word2phone = [
-         1,
-         2,
-         1,
-         2,
-         2,
-         1,
-         2,
-         2,
-         1,
-         2,
-         2,
-         1,
-         2,
-         2,
-         2,
-         2,
-         2,
-         1,
-         1,
-         2,
-         2,
-         1,
-         2,
-         2,
-         2,
-         2,
-         1,
-         2,
-         2,
-         2,
-         2,
-         2,
-         1,
-         2,
-         2,
-         2,
-         2,
-         1,
-     ]
-
-     # compute the total number of frames
-     total_frames = sum(word2phone)
-     print(word_level_feature.shape)
-     print(word2phone)
-     phone_level_feature = []
-     for i in range(len(word2phone)):
-         print(word_level_feature[i].shape)
-
-         # repeat each word word2phone[i] times
-         repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
-         phone_level_feature.append(repeat_feature)
-
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-     print(phone_level_feature.shape)  # torch.Size([36, 1024])
 
text/cleaner.py DELETED
@@ -1,28 +0,0 @@
- from text import chinese, japanese, english, cleaned_text_to_sequence
-
-
- language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
-
-
- def clean_text(text, language):
-     language_module = language_module_map[language]
-     norm_text = language_module.text_normalize(text)
-     phones, tones, word2ph = language_module.g2p(norm_text)
-     return norm_text, phones, tones, word2ph
-
-
- def clean_text_bert(text, language):
-     language_module = language_module_map[language]
-     norm_text = language_module.text_normalize(text)
-     phones, tones, word2ph = language_module.g2p(norm_text)
-     bert = language_module.get_bert_feature(norm_text, word2ph)
-     return phones, tones, bert
-
-
- def text_to_sequence(text, language):
-     norm_text, phones, tones, word2ph = clean_text(text, language)
-     return cleaned_text_to_sequence(phones, tones, language)
-
-
- if __name__ == "__main__":
-     pass
 
text/cmudict.rep DELETED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
- size 6212655
 
text/english.py DELETED
@@ -1,495 +0,0 @@
1
- import pickle
2
- import os
3
- import re
4
- from g2p_en import G2p
5
- from transformers import DebertaV2Tokenizer
6
-
7
- from text import symbols
8
- from text.symbols import punctuation
9
-
10
- current_file_path = os.path.dirname(__file__)
11
- CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
12
- CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
13
- _g2p = G2p()
14
- LOCAL_PATH = "./bert/deberta-v3-large"
15
- tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
16
-
17
- arpa = {
18
- "AH0",
19
- "S",
20
- "AH1",
21
- "EY2",
22
- "AE2",
23
- "EH0",
24
- "OW2",
25
- "UH0",
26
- "NG",
27
- "B",
28
- "G",
29
- "AY0",
30
- "M",
31
- "AA0",
32
- "F",
33
- "AO0",
34
- "ER2",
35
- "UH1",
36
- "IY1",
37
- "AH2",
38
- "DH",
39
- "IY0",
40
- "EY1",
41
- "IH0",
42
- "K",
43
- "N",
44
- "W",
45
- "IY2",
46
- "T",
47
- "AA1",
48
- "ER1",
49
- "EH2",
50
- "OY0",
51
- "UH2",
52
- "UW1",
53
- "Z",
54
- "AW2",
55
- "AW1",
56
- "V",
57
- "UW2",
58
- "AA2",
59
- "ER",
60
- "AW0",
61
- "UW0",
62
- "R",
63
- "OW1",
64
- "EH1",
65
- "ZH",
66
- "AE0",
67
- "IH2",
68
- "IH",
69
- "Y",
70
- "JH",
71
- "P",
72
- "AY1",
73
- "EY0",
74
- "OY2",
75
- "TH",
76
- "HH",
77
- "D",
78
- "ER0",
79
- "CH",
80
- "AO1",
81
- "AE1",
82
- "AO2",
83
- "OY1",
84
- "AY2",
85
- "IH1",
86
- "OW0",
87
- "L",
88
- "SH",
89
- }
90
-
91
-
92
- def post_replace_ph(ph):
93
- rep_map = {
94
- ":": ",",
95
- ";": ",",
96
- ",": ",",
97
- "。": ".",
98
- "!": "!",
99
- "?": "?",
100
- "\n": ".",
101
- "·": ",",
102
- "、": ",",
103
- "…": "...",
104
- "···": "...",
105
- "・・・": "...",
106
- "v": "V",
107
- }
108
- if ph in rep_map.keys():
109
- ph = rep_map[ph]
110
- if ph in symbols:
111
- return ph
112
- if ph not in symbols:
113
- ph = "UNK"
114
- return ph
115
-
116
-
117
- rep_map = {
118
- ":": ",",
119
- ";": ",",
120
- ",": ",",
121
- "。": ".",
122
- "!": "!",
123
- "?": "?",
124
- "\n": ".",
125
- ".": ".",
126
- "…": "...",
127
- "···": "...",
128
- "・・・": "...",
129
- "·": ",",
130
- "・": ",",
131
- "、": ",",
132
- "$": ".",
133
- "“": "'",
134
- "”": "'",
135
- '"': "'",
136
- "‘": "'",
137
- "’": "'",
138
- "(": "'",
139
- ")": "'",
140
- "(": "'",
141
- ")": "'",
142
- "《": "'",
143
- "》": "'",
144
- "【": "'",
145
- "】": "'",
146
- "[": "'",
147
- "]": "'",
148
- "—": "-",
149
- "−": "-",
150
- "~": "-",
151
- "~": "-",
152
- "「": "'",
153
- "」": "'",
154
- }
155
-
156
-
157
- def replace_punctuation(text):
158
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
159
-
160
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
161
-
162
- # replaced_text = re.sub(
163
- # r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
164
- # + "".join(punctuation)
165
- # + r"]+",
166
- # "",
167
- # replaced_text,
168
- # )
169
-
170
- return replaced_text
171
-
172
-
173
- def read_dict():
174
- g2p_dict = {}
175
- start_line = 49
176
- with open(CMU_DICT_PATH) as f:
177
- line = f.readline()
178
- line_index = 1
179
- while line:
180
- if line_index >= start_line:
181
- line = line.strip()
182
- word_split = line.split(" ")
183
- word = word_split[0]
184
-
185
- syllable_split = word_split[1].split(" - ")
186
- g2p_dict[word] = []
187
- for syllable in syllable_split:
188
- phone_split = syllable.split(" ")
189
- g2p_dict[word].append(phone_split)
190
-
191
- line_index = line_index + 1
192
- line = f.readline()
193
-
194
- return g2p_dict
195
-
196
-
197
- def cache_dict(g2p_dict, file_path):
198
- with open(file_path, "wb") as pickle_file:
199
- pickle.dump(g2p_dict, pickle_file)
200
-
201
-
202
- def get_dict():
203
- if os.path.exists(CACHE_PATH):
204
- with open(CACHE_PATH, "rb") as pickle_file:
205
- g2p_dict = pickle.load(pickle_file)
206
- else:
207
- g2p_dict = read_dict()
208
- cache_dict(g2p_dict, CACHE_PATH)
209
-
210
- return g2p_dict
211
-
212
-
213
- eng_dict = get_dict()
214
-
215
-
216
- def refine_ph(phn):
217
- tone = 0
218
- if re.search(r"\d$", phn):
219
- tone = int(phn[-1]) + 1
220
- phn = phn[:-1]
221
- else:
222
- tone = 3
223
- return phn.lower(), tone
224
-
225
-
226
- def refine_syllables(syllables):
227
- tones = []
228
- phonemes = []
229
- for phn_list in syllables:
230
- for i in range(len(phn_list)):
231
- phn = phn_list[i]
232
- phn, tone = refine_ph(phn)
233
- phonemes.append(phn)
234
- tones.append(tone)
235
- return phonemes, tones
236
-
237
-
238
- import re
239
- import inflect
240
-
241
- _inflect = inflect.engine()
242
- _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
243
- _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
244
- _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
245
- _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
246
- _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
247
- _number_re = re.compile(r"[0-9]+")
248
-
249
- # List of (regular expression, replacement) pairs for abbreviations:
250
- _abbreviations = [
251
- (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
252
- for x in [
253
- ("mrs", "misess"),
254
- ("mr", "mister"),
255
- ("dr", "doctor"),
256
- ("st", "saint"),
257
- ("co", "company"),
258
- ("jr", "junior"),
259
- ("maj", "major"),
260
- ("gen", "general"),
261
- ("drs", "doctors"),
262
- ("rev", "reverend"),
263
- ("lt", "lieutenant"),
264
- ("hon", "honorable"),
265
- ("sgt", "sergeant"),
266
- ("capt", "captain"),
267
- ("esq", "esquire"),
268
- ("ltd", "limited"),
269
- ("col", "colonel"),
270
- ("ft", "fort"),
271
- ]
272
- ]
273
-
274
-
275
- # List of (ipa, lazy ipa) pairs:
276
- _lazy_ipa = [
277
- (re.compile("%s" % x[0]), x[1])
278
- for x in [
279
- ("r", "ɹ"),
280
- ("æ", "e"),
281
- ("ɑ", "a"),
282
- ("ɔ", "o"),
283
- ("ð", "z"),
284
- ("θ", "s"),
285
- ("ɛ", "e"),
286
- ("ɪ", "i"),
287
- ("ʊ", "u"),
288
- ("ʒ", "ʥ"),
289
- ("ʤ", "ʥ"),
290
- ("ˈ", "↓"),
291
- ]
292
- ]
293
-
294
- # List of (ipa, lazy ipa2) pairs:
295
- _lazy_ipa2 = [
296
- (re.compile("%s" % x[0]), x[1])
297
- for x in [
298
- ("r", "ɹ"),
299
- ("ð", "z"),
300
- ("θ", "s"),
301
- ("ʒ", "ʑ"),
302
- ("ʤ", "dʑ"),
303
- ("ˈ", "↓"),
304
- ]
305
- ]
306
-
307
- # List of (ipa, ipa2) pairs
308
- _ipa_to_ipa2 = [
309
- (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
310
- ]
311
-
312
-
313
- def _expand_dollars(m):
314
- match = m.group(1)
315
- parts = match.split(".")
316
- if len(parts) > 2:
317
- return match + " dollars" # Unexpected format
318
- dollars = int(parts[0]) if parts[0] else 0
319
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
320
- if dollars and cents:
321
- dollar_unit = "dollar" if dollars == 1 else "dollars"
322
- cent_unit = "cent" if cents == 1 else "cents"
323
- return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
324
- elif dollars:
325
- dollar_unit = "dollar" if dollars == 1 else "dollars"
326
- return "%s %s" % (dollars, dollar_unit)
327
- elif cents:
328
- cent_unit = "cent" if cents == 1 else "cents"
329
- return "%s %s" % (cents, cent_unit)
330
- else:
331
- return "zero dollars"
332
-
333
-
334
- def _remove_commas(m):
335
- return m.group(1).replace(",", "")
336
-
337
-
338
- def _expand_ordinal(m):
339
- return _inflect.number_to_words(m.group(0))
340
-
341
-
342
- def _expand_number(m):
343
- num = int(m.group(0))
344
- if num > 1000 and num < 3000:
345
- if num == 2000:
346
- return "two thousand"
347
- elif num > 2000 and num < 2010:
348
- return "two thousand " + _inflect.number_to_words(num % 100)
349
- elif num % 100 == 0:
350
- return _inflect.number_to_words(num // 100) + " hundred"
351
- else:
352
- return _inflect.number_to_words(
353
- num, andword="", zero="oh", group=2
354
- ).replace(", ", " ")
355
- else:
356
- return _inflect.number_to_words(num, andword="")
357
-
358
-
359
- def _expand_decimal_point(m):
360
- return m.group(1).replace(".", " point ")
361
-
362
-
363
- def normalize_numbers(text):
364
- text = re.sub(_comma_number_re, _remove_commas, text)
365
- text = re.sub(_pounds_re, r"\1 pounds", text)
366
- text = re.sub(_dollars_re, _expand_dollars, text)
367
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
368
- text = re.sub(_ordinal_re, _expand_ordinal, text)
369
- text = re.sub(_number_re, _expand_number, text)
370
- return text
371
-
372
-
373
- def text_normalize(text):
374
- text = normalize_numbers(text)
375
- text = replace_punctuation(text)
376
- text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
377
- return text
378
-
379
-
380
- def distribute_phone(n_phone, n_word):
381
- phones_per_word = [0] * n_word
382
- for task in range(n_phone):
383
- min_tasks = min(phones_per_word)
384
- min_index = phones_per_word.index(min_tasks)
385
- phones_per_word[min_index] += 1
386
- return phones_per_word
387
-
388
-
389
- def sep_text(text):
390
- words = re.split(r"([,;.\?\!\s+])", text)
391
- words = [word for word in words if word.strip() != ""]
392
- return words
393
-
394
-
395
- def text_to_words(text):
396
- tokens = tokenizer.tokenize(text)
397
- words = []
398
- for idx, t in enumerate(tokens):
399
- if t.startswith("▁"):
400
- words.append([t[1:]])
401
- else:
402
- if t in punctuation:
403
- if idx == len(tokens) - 1:
404
- words.append([f"{t}"])
405
- else:
406
- if (
407
- not tokens[idx + 1].startswith("▁")
408
- and tokens[idx + 1] not in punctuation
409
- ):
410
- if idx == 0:
411
- words.append([])
412
- words[-1].append(f"{t}")
413
- else:
414
- words.append([f"{t}"])
415
- else:
416
- if idx == 0:
417
- words.append([])
418
- words[-1].append(f"{t}")
419
- return words
420
-
421
-
422
- def g2p(text):
423
- phones = []
424
- tones = []
425
- phone_len = []
426
- # words = sep_text(text)
427
- # tokens = [tokenizer.tokenize(i) for i in words]
428
- words = text_to_words(text)
429
-
430
- for word in words:
431
- temp_phones, temp_tones = [], []
432
- if len(word) > 1:
433
- if "'" in word:
434
- word = ["".join(word)]
435
- for w in word:
436
- if w in punctuation:
437
- temp_phones.append(w)
438
- temp_tones.append(0)
439
- continue
440
- if w.upper() in eng_dict:
441
- phns, tns = refine_syllables(eng_dict[w.upper()])
442
- temp_phones += [post_replace_ph(i) for i in phns]
443
- temp_tones += tns
444
- # w2ph.append(len(phns))
445
- else:
446
- phone_list = list(filter(lambda p: p != " ", _g2p(w)))
447
- phns = []
448
- tns = []
449
- for ph in phone_list:
450
- if ph in arpa:
451
- ph, tn = refine_ph(ph)
452
- phns.append(ph)
453
- tns.append(tn)
454
- else:
455
- phns.append(ph)
456
- tns.append(0)
457
- temp_phones += [post_replace_ph(i) for i in phns]
458
- temp_tones += tns
459
- phones += temp_phones
460
- tones += temp_tones
461
- phone_len.append(len(temp_phones))
462
- # phones = [post_replace_ph(i) for i in phones]
463
-
464
- word2ph = []
465
- for token, pl in zip(words, phone_len):
466
- word_len = len(token)
467
-
468
- aaa = distribute_phone(pl, word_len)
469
- word2ph += aaa
470
-
471
- phones = ["_"] + phones + ["_"]
472
- tones = [0] + tones + [0]
473
- word2ph = [1] + word2ph + [1]
474
- assert len(phones) == len(tones), text
475
- assert len(phones) == sum(word2ph), text
476
-
477
- return phones, tones, word2ph
478
-
479
-
480
- def get_bert_feature(text, word2ph):
481
- from text import english_bert_mock
482
-
483
- return english_bert_mock.get_bert_feature(text, word2ph)
484
-
485
-
486
- if __name__ == "__main__":
487
- # print(get_dict())
488
- # print(eng_word_to_phoneme("hello"))
489
- print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
490
- # all_phones = set()
491
- # for k, syllables in eng_dict.items():
492
- # for group in syllables:
493
- # for ph in group:
494
- # all_phones.add(ph)
495
- # print(all_phones)
 
text/english_bert_mock.py DELETED
@@ -1,61 +0,0 @@
- import sys
-
- import torch
- from transformers import DebertaV2Model, DebertaV2Tokenizer
-
- from config import config
-
-
- LOCAL_PATH = "./bert/deberta-v3-large"
-
- tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
-
- models = dict()
-
-
- def get_bert_feature(
-     text,
-     word2ph,
-     device=config.bert_gen_config.device,
-     style_text=None,
-     style_weight=0.7,
- ):
-     if (
-         sys.platform == "darwin"
-         and torch.backends.mps.is_available()
-         and device == "cpu"
-     ):
-         device = "mps"
-     if not device:
-         device = "cuda"
-     if device not in models.keys():
-         models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
-     with torch.no_grad():
-         inputs = tokenizer(text, return_tensors="pt")
-         for i in inputs:
-             inputs[i] = inputs[i].to(device)
-         res = models[device](**inputs, output_hidden_states=True)
-         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
-         if style_text:
-             style_inputs = tokenizer(style_text, return_tensors="pt")
-             for i in style_inputs:
-                 style_inputs[i] = style_inputs[i].to(device)
-             style_res = models[device](**style_inputs, output_hidden_states=True)
-             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
-             style_res_mean = style_res.mean(0)
-     assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
-     word2phone = word2ph
-     phone_level_feature = []
-     for i in range(len(word2phone)):
-         if style_text:
-             repeat_feature = (
-                 res[i].repeat(word2phone[i], 1) * (1 - style_weight)
-                 + style_res_mean.repeat(word2phone[i], 1) * style_weight
-             )
-         else:
-             repeat_feature = res[i].repeat(word2phone[i], 1)
-         phone_level_feature.append(repeat_feature)
-
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-
-     return phone_level_feature.T
 
text/japanese.py DELETED
@@ -1,432 +0,0 @@
1
- # Convert Japanese text to phonemes which is
2
- # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
- import re
4
- import unicodedata
5
-
6
- from transformers import AutoTokenizer
7
-
8
- from text import punctuation, symbols
9
-
10
- from num2words import num2words
11
-
12
- import pyopenjtalk
13
- import jaconv
14
-
15
-
16
- def kata2phoneme(text: str) -> str:
17
- """Convert katakana text to phonemes."""
18
- text = text.strip()
19
- if text == "ー":
20
- return ["ー"]
21
- elif text.startswith("ー"):
22
- return ["ー"] + kata2phoneme(text[1:])
23
- res = []
24
- prev = None
25
- while text:
26
- if re.match(_MARKS, text):
27
- res.append(text)
28
- text = text[1:]
29
- continue
30
- if text.startswith("ー"):
31
- if prev:
32
- res.append(prev[-1])
33
- text = text[1:]
34
- continue
35
- res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
36
- break
37
- # res = _COLON_RX.sub(":", res)
38
- return res
39
-
40
-
41
- def hira2kata(text: str) -> str:
42
- return jaconv.hira2kata(text)
43
-
44
-
45
- _SYMBOL_TOKENS = set(list("・、。?!"))
46
- _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
47
- _MARKS = re.compile(
48
- r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
49
- )
50
-
51
-
52
- def text2kata(text: str) -> str:
53
- parsed = pyopenjtalk.run_frontend(text)
54
-
55
- res = []
56
- for parts in parsed:
57
- word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
58
- "’", ""
59
- )
60
- if yomi:
61
- if re.match(_MARKS, yomi):
62
- if len(word) > 1:
63
- word = [replace_punctuation(i) for i in list(word)]
64
- yomi = word
65
- res += yomi
66
- sep += word
67
- continue
68
- elif word not in rep_map.keys() and word not in rep_map.values():
69
- word = ","
70
- yomi = word
71
- res.append(yomi)
72
- else:
73
- if word in _SYMBOL_TOKENS:
74
- res.append(word)
75
- elif word in ("っ", "ッ"):
76
- res.append("ッ")
77
- elif word in _NO_YOMI_TOKENS:
78
- pass
79
- else:
80
- res.append(word)
81
- return hira2kata("".join(res))
82
-
83
-
84
- def text2sep_kata(text: str) -> (list, list):
85
- parsed = pyopenjtalk.run_frontend(text)
86
-
87
- res = []
88
- sep = []
89
- for parts in parsed:
90
- word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
91
- "’", ""
92
- )
93
- if yomi:
94
- if re.match(_MARKS, yomi):
95
- if len(word) > 1:
96
- word = [replace_punctuation(i) for i in list(word)]
97
- yomi = word
98
- res += yomi
99
- sep += word
100
- continue
101
- elif word not in rep_map.keys() and word not in rep_map.values():
102
- word = ","
103
- yomi = word
104
- res.append(yomi)
105
- else:
106
- if word in _SYMBOL_TOKENS:
107
- res.append(word)
108
- elif word in ("っ", "ッ"):
109
- res.append("ッ")
110
- elif word in _NO_YOMI_TOKENS:
111
- pass
112
- else:
113
- res.append(word)
114
- sep.append(word)
115
- return sep, [hira2kata(i) for i in res], get_accent(parsed)
116
-
117
-
118
- def get_accent(parsed):
119
- labels = pyopenjtalk.make_label(parsed)
120
-
121
- phonemes = []
122
- accents = []
123
- for n, label in enumerate(labels):
124
- phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
125
- if phoneme not in ["sil", "pau"]:
126
- phonemes.append(phoneme.replace("cl", "q").lower())
127
- else:
128
- continue
129
- a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
130
- a2 = int(re.search(r"\+(\d+)\+", label).group(1))
131
- if re.search(r"\-([^\+]*)\+", labels[n + 1]).group(1) in ["sil", "pau"]:
132
- a2_next = -1
133
- else:
134
- a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
135
- # Falling
136
- if a1 == 0 and a2_next == a2 + 1:
137
- accents.append(-1)
138
- # Rising
139
- elif a2 == 1 and a2_next == 2:
140
- accents.append(1)
141
- else:
142
- accents.append(0)
143
- return list(zip(phonemes, accents))
144
-
145
-
146
- _ALPHASYMBOL_YOMI = {
147
- "#": "シャープ",
148
- "%": "パーセント",
149
- "&": "アンド",
150
- "+": "プラス",
151
- "-": "マイナス",
152
- ":": "コロン",
153
- ";": "セミコロン",
154
- "<": "小なり",
155
- "=": "イコール",
156
- ">": "大なり",
157
- "@": "アット",
158
- "a": "エー",
159
- "b": "ビー",
160
- "c": "シー",
161
- "d": "ディー",
162
- "e": "イー",
163
- "f": "エフ",
164
- "g": "ジー",
165
- "h": "エイチ",
166
- "i": "アイ",
167
- "j": "ジェー",
168
- "k": "ケー",
169
- "l": "エル",
170
- "m": "エム",
171
- "n": "エヌ",
172
- "o": "オー",
173
- "p": "ピー",
174
- "q": "キュー",
175
- "r": "アール",
176
- "s": "エス",
177
- "t": "ティー",
178
- "u": "ユー",
179
- "v": "ブイ",
180
- "w": "ダブリュー",
181
- "x": "エックス",
182
- "y": "ワイ",
183
- "z": "ゼット",
184
- "α": "アルファ",
185
- "β": "ベータ",
186
- "γ": "ガンマ",
187
- "δ": "デルタ",
188
- "ε": "イプシロン",
189
- "ζ": "ゼータ",
190
- "η": "イータ",
191
- "θ": "シータ",
192
- "ι": "イオタ",
193
- "κ": "カッパ",
194
- "λ": "ラムダ",
195
- "μ": "ミュー",
196
- "ν": "ニュー",
197
- "ξ": "クサイ",
198
- "ο": "オミクロン",
199
- "π": "パイ",
200
- "ρ": "ロー",
201
- "σ": "シグマ",
202
- "τ": "タウ",
203
- "υ": "ウプシロン",
204
- "φ": "ファイ",
205
- "χ": "カイ",
206
- "ψ": "プサイ",
207
- "ω": "オメガ",
208
- }
209
-
210
-
211
- _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
212
- _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
213
- _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
214
- _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
215
-
216
-
217
- def japanese_convert_numbers_to_words(text: str) -> str:
218
- res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
219
- res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
220
- res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
221
- return res
222
-
223
-
224
- def japanese_convert_alpha_symbols_to_words(text: str) -> str:
225
- return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
226
-
227
-
228
- def japanese_text_to_phonemes(text: str) -> str:
229
- """Convert Japanese text to phonemes."""
230
- res = unicodedata.normalize("NFKC", text)
231
- res = japanese_convert_numbers_to_words(res)
232
- # res = japanese_convert_alpha_symbols_to_words(res)
233
- res = text2kata(res)
234
- res = kata2phoneme(res)
235
- return res
236
-
237
-
238
- def is_japanese_character(char):
239
- # 定义日语文字系统的 Unicode 范围
240
- japanese_ranges = [
241
- (0x3040, 0x309F), # 平假名
242
- (0x30A0, 0x30FF), # 片假名
243
- (0x4E00, 0x9FFF), # 汉字 (CJK Unified Ideographs)
244
- (0x3400, 0x4DBF), # 汉字扩展 A
245
- (0x20000, 0x2A6DF), # 汉字扩展 B
246
- # 可以根据需要添加其他汉字扩展范围
247
- ]
248
-
249
- # 将字符的 Unicode 编码转换为整数
250
- char_code = ord(char)
251
-
252
- # 检查字符是否在任何一个日语范围内
253
- for start, end in japanese_ranges:
254
- if start <= char_code <= end:
255
- return True
256
-
257
- return False
258
-
259
-
260
- rep_map = {
261
- ":": ",",
262
- ";": ",",
263
- ",": ",",
264
- "。": ".",
265
- "!": "!",
266
- "?": "?",
267
- "\n": ".",
268
- ".": ".",
269
- "…": "...",
270
- "···": "...",
271
- "・・・": "...",
272
- "·": ",",
273
- "・": ",",
274
- "、": ",",
275
- "$": ".",
276
- "“": "'",
277
- "”": "'",
278
- '"': "'",
279
- "‘": "'",
280
- "’": "'",
281
- "(": "'",
282
- ")": "'",
283
- "(": "'",
284
- ")": "'",
285
- "《": "'",
286
- "》": "'",
287
- "【": "'",
288
- "】": "'",
289
- "[": "'",
290
- "]": "'",
291
- "—": "-",
292
- "−": "-",
293
- "~": "-",
294
- "~": "-",
295
- "「": "'",
296
- "」": "'",
297
- }
298
-
299
-
300
- def replace_punctuation(text):
301
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
302
-
303
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
304
-
305
- replaced_text = re.sub(
306
- r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
307
- + "".join(punctuation)
308
- + r"]+",
309
- "",
310
- replaced_text,
311
- )
312
-
313
- return replaced_text
314
-
315
-
316
- def text_normalize(text):
317
- res = unicodedata.normalize("NFKC", text)
318
- res = japanese_convert_numbers_to_words(res)
319
- # res = "".join([i for i in res if is_japanese_character(i)])
320
- res = replace_punctuation(res)
321
- res = res.replace("゙", "")
322
- return res
323
-
324
-
325
- def distribute_phone(n_phone, n_word):
326
- phones_per_word = [0] * n_word
327
- for task in range(n_phone):
328
- min_tasks = min(phones_per_word)
329
- min_index = phones_per_word.index(min_tasks)
330
- phones_per_word[min_index] += 1
331
- return phones_per_word
332
-
333
-
334
- def handle_long(sep_phonemes):
335
- for i in range(len(sep_phonemes)):
336
- if sep_phonemes[i][0] == "ー":
337
- sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
338
- if "ー" in sep_phonemes[i]:
339
- for j in range(len(sep_phonemes[i])):
340
- if sep_phonemes[i][j] == "ー":
341
- sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
342
- return sep_phonemes
343
-
344
-
345
- tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
346
-
347
-
348
- def align_tones(phones, tones):
349
- res = []
350
- for pho in phones:
351
- temp = [0] * len(pho)
352
- for idx, p in enumerate(pho):
353
- if len(tones) == 0:
354
- break
355
- if p == tones[0][0]:
356
- temp[idx] = tones[0][1]
357
- if idx > 0:
358
- temp[idx] += temp[idx - 1]
359
- tones.pop(0)
360
- temp = [0] + temp
361
- temp = temp[:-1]
362
- if -1 in temp:
363
- temp = [i + 1 for i in temp]
364
- res.append(temp)
365
- res = [i for j in res for i in j]
366
- assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
367
- return res
368
-
369
-
370
- def rearrange_tones(tones, phones):
371
- res = [0] * len(tones)
372
- for i in range(len(tones)):
373
- if i == 0:
374
- if tones[i] not in punctuation:
375
- res[i] = 1
376
- elif tones[i] == prev:
377
- if phones[i] in punctuation:
378
- res[i] = 0
379
- else:
380
- res[i] = 1
381
- elif tones[i] > prev:
382
- res[i] = 2
383
- elif tones[i] < prev:
384
- res[i - 1] = 3
385
- res[i] = 1
386
- prev = tones[i]
387
- return res
388
-
389
-
390
- def g2p(norm_text):
391
- sep_text, sep_kata, acc = text2sep_kata(norm_text)
392
- sep_tokenized = []
393
- for i in sep_text:
394
- if i not in punctuation:
395
- sep_tokenized.append(tokenizer.tokenize(i))
396
- else:
397
- sep_tokenized.append([i])
398
-
399
- sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])
400
- # 异常处理,MeCab不认识的词的话会一路传到这里来,然后炸掉。目前来看只有那些超级稀有的生僻词会出现这种情况
401
- for i in sep_phonemes:
402
- for j in i:
403
- assert j in symbols, (sep_text, sep_kata, sep_phonemes)
404
- tones = align_tones(sep_phonemes, acc)
405
-
406
- word2ph = []
407
- for token, phoneme in zip(sep_tokenized, sep_phonemes):
408
- phone_len = len(phoneme)
409
- word_len = len(token)
410
-
411
- aaa = distribute_phone(phone_len, word_len)
412
- word2ph += aaa
413
- phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
414
- # tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
415
- tones = [0] + tones + [0]
416
- word2ph = [1] + word2ph + [1]
417
- assert len(phones) == len(tones)
418
- return phones, tones, word2ph
419
-
420
-
421
- if __name__ == "__main__":
422
- tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
423
- text = "hello,こんにちは、世界ー!……"
424
- from text.japanese_bert import get_bert_feature
425
-
426
- text = text_normalize(text)
427
- print(text)
428
-
429
- phones, tones, word2ph = g2p(text)
430
- bert = get_bert_feature(text, word2ph)
431
-
432
- print(phones, tones, word2ph, bert.shape)
 
text/japanese_bert.py DELETED
@@ -1,65 +0,0 @@
- import sys
-
- import torch
- from transformers import AutoModelForMaskedLM, AutoTokenizer
-
- from config import config
- from text.japanese import text2sep_kata
-
- LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
-
- tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
-
- models = dict()
-
-
- def get_bert_feature(
-     text,
-     word2ph,
-     device=config.bert_gen_config.device,
-     style_text=None,
-     style_weight=0.7,
- ):
-     text = "".join(text2sep_kata(text)[0])
-     if style_text:
-         style_text = "".join(text2sep_kata(style_text)[0])
-     if (
-         sys.platform == "darwin"
-         and torch.backends.mps.is_available()
-         and device == "cpu"
-     ):
-         device = "mps"
-     if not device:
-         device = "cuda"
-     if device not in models.keys():
-         models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
-     with torch.no_grad():
-         inputs = tokenizer(text, return_tensors="pt")
-         for i in inputs:
-             inputs[i] = inputs[i].to(device)
-         res = models[device](**inputs, output_hidden_states=True)
-         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
-         if style_text:
-             style_inputs = tokenizer(style_text, return_tensors="pt")
-             for i in style_inputs:
-                 style_inputs[i] = style_inputs[i].to(device)
-             style_res = models[device](**style_inputs, output_hidden_states=True)
-             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
-             style_res_mean = style_res.mean(0)
-
-     assert len(word2ph) == len(text) + 2
-     word2phone = word2ph
-     phone_level_feature = []
-     for i in range(len(word2phone)):
-         if style_text:
-             repeat_feature = (
-                 res[i].repeat(word2phone[i], 1) * (1 - style_weight)
-                 + style_res_mean.repeat(word2phone[i], 1) * style_weight
-             )
-         else:
-             repeat_feature = res[i].repeat(word2phone[i], 1)
-         phone_level_feature.append(repeat_feature)
-
-     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-
-     return phone_level_feature.T
 
text/opencpop-strict.txt DELETED
@@ -1,429 +0,0 @@
1
- a AA a
2
- ai AA ai
3
- an AA an
4
- ang AA ang
5
- ao AA ao
6
- ba b a
7
- bai b ai
8
- ban b an
9
- bang b ang
10
- bao b ao
11
- bei b ei
12
- ben b en
13
- beng b eng
14
- bi b i
15
- bian b ian
16
- biao b iao
17
- bie b ie
18
- bin b in
19
- bing b ing
20
- bo b o
21
- bu b u
22
- ca c a
23
- cai c ai
24
- can c an
25
- cang c ang
26
- cao c ao
27
- ce c e
28
- cei c ei
29
- cen c en
30
- ceng c eng
31
- cha ch a
32
- chai ch ai
33
- chan ch an
34
- chang ch ang
35
- chao ch ao
36
- che ch e
37
- chen ch en
38
- cheng ch eng
39
- chi ch ir
40
- chong ch ong
41
- chou ch ou
42
- chu ch u
43
- chua ch ua
44
- chuai ch uai
45
- chuan ch uan
46
- chuang ch uang
47
- chui ch ui
48
- chun ch un
49
- chuo ch uo
50
- ci c i0
51
- cong c ong
52
- cou c ou
53
- cu c u
54
- cuan c uan
55
- cui c ui
56
- cun c un
57
- cuo c uo
58
- da d a
59
- dai d ai
60
- dan d an
61
- dang d ang
62
- dao d ao
63
- de d e
64
- dei d ei
65
- den d en
66
- deng d eng
67
- di d i
68
- dia d ia
69
- dian d ian
70
- diao d iao
71
- die d ie
72
- ding d ing
73
- diu d iu
74
- dong d ong
75
- dou d ou
76
- du d u
77
- duan d uan
78
- dui d ui
79
- dun d un
80
- duo d uo
81
- e EE e
82
- ei EE ei
83
- en EE en
84
- eng EE eng
85
- er EE er
86
- fa f a
87
- fan f an
88
- fang f ang
89
- fei f ei
90
- fen f en
91
- feng f eng
92
- fo f o
93
- fou f ou
94
- fu f u
95
- ga g a
96
- gai g ai
97
- gan g an
98
- gang g ang
99
- gao g ao
100
- ge g e
101
- gei g ei
102
- gen g en
103
- geng g eng
104
- gong g ong
105
- gou g ou
106
- gu g u
107
- gua g ua
108
- guai g uai
109
- guan g uan
110
- guang g uang
111
- gui g ui
112
- gun g un
113
- guo g uo
114
- ha h a
115
- hai h ai
116
- han h an
117
- hang h ang
118
- hao h ao
119
- he h e
120
- hei h ei
121
- hen h en
122
- heng h eng
123
- hong h ong
124
- hou h ou
125
- hu h u
126
- hua h ua
127
- huai h uai
128
- huan h uan
129
- huang h uang
130
- hui h ui
131
- hun h un
132
- huo h uo
133
- ji j i
134
- jia j ia
135
- jian j ian
136
- jiang j iang
137
- jiao j iao
138
- jie j ie
139
- jin j in
140
- jing j ing
141
- jiong j iong
142
- jiu j iu
143
- ju j v
144
- jv j v
145
- juan j van
146
- jvan j van
147
- jue j ve
148
- jve j ve
149
- jun j vn
150
- jvn j vn
151
- ka k a
152
- kai k ai
153
- kan k an
154
- kang k ang
155
- kao k ao
156
- ke k e
157
- kei k ei
158
- ken k en
159
- keng k eng
160
- kong k ong
161
- kou k ou
162
- ku k u
163
- kua k ua
164
- kuai k uai
165
- kuan k uan
166
- kuang k uang
167
- kui k ui
168
- kun k un
169
- kuo k uo
170
- la l a
171
- lai l ai
172
- lan l an
173
- lang l ang
174
- lao l ao
175
- le l e
176
- lei l ei
177
- leng l eng
178
- li l i
179
- lia l ia
180
- lian l ian
181
- liang l iang
182
- liao l iao
183
- lie l ie
184
- lin l in
185
- ling l ing
186
- liu l iu
187
- lo l o
188
- long l ong
189
- lou l ou
190
- lu l u
191
- luan l uan
192
- lun l un
193
- luo l uo
194
- lv l v
195
- lve l ve
196
- ma m a
197
- mai m ai
198
- man m an
199
- mang m ang
200
- mao m ao
201
- me m e
202
- mei m ei
203
- men m en
204
- meng m eng
205
- mi m i
206
- mian m ian
207
- miao m iao
208
- mie m ie
209
- min m in
210
- ming m ing
211
- miu m iu
212
- mo m o
213
- mou m ou
214
- mu m u
215
- na n a
216
- nai n ai
217
- nan n an
218
- nang n ang
219
- nao n ao
220
- ne n e
221
- nei n ei
222
- nen n en
223
- neng n eng
224
- ni n i
225
- nian n ian
226
- niang n iang
227
- niao n iao
228
- nie n ie
229
- nin n in
230
- ning n ing
231
- niu n iu
232
- nong n ong
233
- nou n ou
234
- nu n u
235
- nuan n uan
236
- nun n un
237
- nuo n uo
238
- nv n v
239
- nve n ve
240
- o OO o
241
- ou OO ou
242
- pa p a
243
- pai p ai
244
- pan p an
245
- pang p ang
246
- pao p ao
247
- pei p ei
248
- pen p en
249
- peng p eng
250
- pi p i
251
- pian p ian
252
- piao p iao
253
- pie p ie
254
- pin p in
255
- ping p ing
256
- po p o
257
- pou p ou
258
- pu p u
259
- qi q i
260
- qia q ia
261
- qian q ian
262
- qiang q iang
263
- qiao q iao
264
- qie q ie
265
- qin q in
266
- qing q ing
267
- qiong q iong
268
- qiu q iu
269
- qu q v
270
- qv q v
271
- quan q van
272
- qvan q van
273
- que q ve
274
- qve q ve
275
- qun q vn
276
- qvn q vn
277
- ran r an
278
- rang r ang
279
- rao r ao
280
- re r e
281
- ren r en
282
- reng r eng
283
- ri r ir
284
- rong r ong
285
- rou r ou
286
- ru r u
287
- rua r ua
288
- ruan r uan
289
- rui r ui
290
- run r un
291
- ruo r uo
292
- sa s a
293
- sai s ai
294
- san s an
295
- sang s ang
296
- sao s ao
297
- se s e
298
- sen s en
299
- seng s eng
300
- sha sh a
301
- shai sh ai
302
- shan sh an
303
- shang sh ang
304
- shao sh ao
305
- she sh e
306
- shei sh ei
307
- shen sh en
308
- sheng sh eng
309
- shi sh ir
310
- shou sh ou
311
- shu sh u
312
- shua sh ua
313
- shuai sh uai
314
- shuan sh uan
315
- shuang sh uang
316
- shui sh ui
317
- shun sh un
318
- shuo sh uo
319
- si s i0
320
- song s ong
321
- sou s ou
322
- su s u
323
- suan s uan
324
- sui s ui
325
- sun s un
326
- suo s uo
327
- ta t a
328
- tai t ai
329
- tan t an
330
- tang t ang
331
- tao t ao
332
- te t e
333
- tei t ei
334
- teng t eng
335
- ti t i
336
- tian t ian
337
- tiao t iao
338
- tie t ie
339
- ting t ing
340
- tong t ong
341
- tou t ou
342
- tu t u
343
- tuan t uan
344
- tui t ui
345
- tun t un
346
- tuo t uo
347
- wa w a
348
- wai w ai
349
- wan w an
350
- wang w ang
351
- wei w ei
352
- wen w en
353
- weng w eng
354
- wo w o
355
- wu w u
356
- xi x i
357
- xia x ia
358
- xian x ian
359
- xiang x iang
360
- xiao x iao
361
- xie x ie
362
- xin x in
363
- xing x ing
364
- xiong x iong
365
- xiu x iu
366
- xu x v
367
- xv x v
368
- xuan x van
369
- xvan x van
370
- xue x ve
371
- xve x ve
372
- xun x vn
373
- xvn x vn
374
- ya y a
375
- yan y En
376
- yang y ang
377
- yao y ao
378
- ye y E
379
- yi y i
380
- yin y in
381
- ying y ing
382
- yo y o
383
- yong y ong
384
- you y ou
385
- yu y v
386
- yv y v
387
- yuan y van
388
- yvan y van
389
- yue y ve
390
- yve y ve
391
- yun y vn
392
- yvn y vn
393
- za z a
394
- zai z ai
395
- zan z an
396
- zang z ang
397
- zao z ao
398
- ze z e
399
- zei z ei
400
- zen z en
401
- zeng z eng
402
- zha zh a
403
- zhai zh ai
404
- zhan zh an
405
- zhang zh ang
406
- zhao zh ao
407
- zhe zh e
408
- zhei zh ei
409
- zhen zh en
410
- zheng zh eng
411
- zhi zh ir
412
- zhong zh ong
413
- zhou zh ou
414
- zhu zh u
415
- zhua zh ua
416
- zhuai zh uai
417
- zhuan zh uan
418
- zhuang zh uang
419
- zhui zh ui
420
- zhun zh un
421
- zhuo zh uo
422
- zi z i0
423
- zong z ong
424
- zou z ou
425
- zu z u
426
- zuan z uan
427
- zui z ui
428
- zun z un
429
- zuo z uo
 
text/symbols.py DELETED
@@ -1,187 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- ]
120
- num_ja_tones = 2
121
-
122
- # English
123
- en_symbols = [
124
- "aa",
125
- "ae",
126
- "ah",
127
- "ao",
128
- "aw",
129
- "ay",
130
- "b",
131
- "ch",
132
- "d",
133
- "dh",
134
- "eh",
135
- "er",
136
- "ey",
137
- "f",
138
- "g",
139
- "hh",
140
- "ih",
141
- "iy",
142
- "jh",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "ng",
148
- "ow",
149
- "oy",
150
- "p",
151
- "r",
152
- "s",
153
- "sh",
154
- "t",
155
- "th",
156
- "uh",
157
- "uw",
158
- "V",
159
- "w",
160
- "y",
161
- "z",
162
- "zh",
163
- ]
164
- num_en_tones = 4
165
-
166
- # combine all symbols
167
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
- symbols = [pad] + normal_symbols + pu_symbols
169
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
-
171
- # combine all tones
172
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
-
174
- # language maps
175
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
- num_languages = len(language_id_map.keys())
177
-
178
- language_tone_start_map = {
179
- "ZH": 0,
180
- "JP": num_zh_tones,
181
- "EN": num_zh_tones + num_ja_tones,
182
- }
183
-
184
- if __name__ == "__main__":
185
- a = set(zh_symbols)
186
- b = set(en_symbols)
187
- print(sorted(a & b))
 
text/tone_sandhi.py DELETED
@@ -1,773 +0,0 @@
1
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import List
15
- from typing import Tuple
16
-
17
- import jieba
18
- from pypinyin import lazy_pinyin
19
- from pypinyin import Style
20
-
21
-
22
- class ToneSandhi:
23
- def __init__(self):
24
- self.must_neural_tone_words = {
25
- "麻烦",
26
- "麻利",
27
- "鸳鸯",
28
- "高粱",
29
- "骨头",
30
- "骆驼",
31
- "马虎",
32
- "首饰",
33
- "馒头",
34
- "馄饨",
35
- "风筝",
36
- "难为",
37
- "队伍",
38
- "阔气",
39
- "闺女",
40
- "门道",
41
- "锄头",
42
- "铺盖",
43
- "铃铛",
44
- "铁匠",
45
- "钥匙",
46
- "里脊",
47
- "里头",
48
- "部分",
49
- "那么",
50
- "道士",
51
- "造化",
52
- "迷糊",
53
- "连累",
54
- "这么",
55
- "这个",
56
- "运气",
57
- "过去",
58
- "软和",
59
- "转悠",
60
- "踏实",
61
- "跳蚤",
62
- "跟头",
63
- "趔趄",
64
- "财主",
65
- "豆腐",
66
- "讲究",
67
- "记性",
68
- "记号",
69
- "认识",
70
- "规矩",
71
- "见识",
72
- "裁缝",
73
- "补丁",
74
- "衣裳",
75
- "衣服",
76
- "衙门",
77
- "街坊",
78
- "行李",
79
- "行当",
80
- "蛤蟆",
81
- "蘑菇",
82
- "薄荷",
83
- "葫芦",
84
- "葡萄",
85
- "萝卜",
86
- "荸荠",
87
- "苗条",
88
- "苗头",
89
- "苍蝇",
90
- "芝麻",
91
- "舒服",
92
- "舒坦",
93
- "舌头",
94
- "自在",
95
- "膏药",
96
- "脾气",
97
- "脑袋",
98
- "脊梁",
99
- "能耐",
100
- "胳膊",
101
- "胭脂",
102
- "胡萝",
103
- "胡琴",
104
- "胡同",
105
- "聪明",
106
- "耽误",
107
- "耽搁",
108
- "耷拉",
109
- "耳朵",
110
- "老爷",
111
- "老实",
112
- "老婆",
113
- "老头",
114
- "老太",
115
- "翻腾",
116
- "罗嗦",
117
- "罐头",
118
- "编辑",
119
- "结实",
120
- "红火",
121
- "累赘",
122
- "糨糊",
123
- "糊涂",
124
- "精神",
125
- "粮食",
126
- "簸箕",
127
- "篱笆",
128
- "算计",
129
- "算盘",
130
- "答应",
131
- "笤帚",
132
- "笑语",
133
- "笑话",
134
- "窟窿",
135
- "窝囊",
136
- "窗户",
137
- "稳当",
138
- "稀罕",
139
- "称呼",
140
- "秧歌",
141
- "秀气",
142
- "秀才",
143
- "福气",
144
- "祖宗",
145
- "砚台",
146
- "码头",
147
- "石榴",
148
- "石头",
149
- "石匠",
150
- "知识",
151
- "眼睛",
152
- "眯缝",
153
- "眨巴",
154
- "眉毛",
155
- "相声",
156
- "盘算",
157
- "白净",
158
- "痢疾",
159
- "痛快",
160
- "疟疾",
161
- "疙瘩",
162
- "疏忽",
163
- "畜生",
164
- "生意",
165
- "甘蔗",
166
- "琵琶",
167
- "琢磨",
168
- "琉璃",
169
- "玻璃",
170
- "玫瑰",
171
- "玄乎",
172
- "狐狸",
173
- "状元",
174
- "特务",
175
- "牲口",
176
- "牙碜",
177
- "牌楼",
178
- "爽快",
179
- "爱人",
180
- "热闹",
181
- "烧饼",
182
- "烟筒",
183
- "烂糊",
184
- "点心",
185
- "炊帚",
186
- "灯笼",
187
- "火候",
188
- "漂亮",
189
- "滑溜",
190
- "溜达",
191
- "温和",
192
- "清楚",
193
- "消息",
194
- "浪头",
195
- "活泼",
196
- "比方",
197
- "正经",
198
- "欺负",
199
- "模糊",
200
- "槟榔",
201
- "棺材",
202
- "棒槌",
203
- "棉花",
204
- "核桃",
205
- "栅栏",
206
- "柴火",
207
- "架势",
208
- "枕头",
209
- "枇杷",
210
- "机灵",
211
- "本事",
212
- "木头",
213
- "木匠",
214
- "朋友",
215
- "月饼",
216
- "月亮",
217
- "暖和",
218
- "明白",
219
- "时候",
220
- "新鲜",
221
- "故事",
222
- "收拾",
223
- "收成",
224
- "提防",
225
- "挖苦",
226
- "挑剔",
227
- "指甲",
228
- "指头",
229
- "拾掇",
230
- "拳头",
231
- "拨弄",
232
- "招牌",
233
- "招呼",
234
- "抬举",
235
- "护士",
236
- "折腾",
237
- "扫帚",
238
- "打量",
239
- "打算",
240
- "打点",
241
- "打扮",
242
- "打听",
243
- "打发",
244
- "扎实",
245
- "扁担",
246
- "戒指",
247
- "懒得",
248
- "意识",
249
- "意思",
250
- "情形",
251
- "悟性",
252
- "怪物",
253
- "思量",
254
- "怎么",
255
- "念头",
256
- "念叨",
257
- "快活",
258
- "忙活",
259
- "志气",
260
- "心思",
261
- "得罪",
262
- "张罗",
263
- "弟兄",
264
- "开通",
265
- "应酬",
266
- "庄稼",
267
- "干事",
268
- "帮手",
269
- "帐篷",
270
- "希罕",
271
- "师父",
272
- "师傅",
273
- "巴结",
274
- "巴掌",
275
- "差事",
276
- "工夫",
277
- "岁数",
278
- "屁股",
279
- "尾巴",
280
- "少爷",
281
- "小气",
282
- "小伙",
283
- "将就",
284
- "对头",
285
- "对付",
286
- "寡妇",
287
- "家伙",
288
- "客气",
289
- "实在",
290
- "官司",
291
- "学问",
292
- "学生",
293
- "字号",
294
- "嫁妆",
295
- "媳妇",
296
- "媒人",
297
- "婆家",
298
- "娘家",
299
- "委屈",
300
- "姑娘",
301
- "姐夫",
302
- "妯娌",
303
- "妥当",
304
- "妖精",
305
- "奴才",
306
- "女婿",
307
- "头发",
308
- "太阳",
309
- "大爷",
310
- "大方",
311
- "大意",
312
- "大夫",
313
- "多少",
314
- "多么",
315
- "外甥",
316
- "壮实",
317
- "地道",
318
- "地方",
319
- "在乎",
320
- "困难",
321
- "嘴巴",
322
- "嘱咐",
323
- "嘟囔",
324
- "嘀咕",
325
- "喜欢",
326
- "喇嘛",
327
- "喇叭",
328
- "商量",
329
- "唾沫",
330
- "哑巴",
331
- "哈欠",
332
- "哆嗦",
333
- "咳嗽",
334
- "和尚",
335
- "告诉",
336
- "告示",
337
- "含糊",
338
- "吓唬",
339
- "后头",
340
- "名字",
341
- "名堂",
342
- "合同",
343
- "吆喝",
344
- "叫唤",
345
- "口袋",
346
- "厚道",
347
- "厉害",
348
- "千斤",
349
- "包袱",
350
- "包涵",
351
- "匀称",
352
- "勤快",
353
- "动静",
354
- "动弹",
355
- "功夫",
356
- "力气",
357
- "前头",
358
- "刺猬",
359
- "刺激",
360
- "别扭",
361
- "利落",
362
- "利索",
363
- "利害",
364
- "分析",
365
- "出息",
366
- "凑合",
367
- "凉快",
368
- "冷战",
369
- "冤枉",
370
- "冒失",
371
- "养活",
372
- "关系",
373
- "先生",
374
- "兄弟",
375
- "便宜",
376
- "使唤",
377
- "佩服",
378
- "作坊",
379
- "体面",
380
- "位置",
381
- "似的",
382
- "伙计",
383
- "休息",
384
- "什么",
385
- "人家",
386
- "亲戚",
387
- "亲家",
388
- "交情",
389
- "云彩",
390
- "事情",
391
- "买卖",
392
- "主意",
393
- "丫头",
394
- "丧气",
395
- "两口",
396
- "东西",
397
- "东家",
398
- "世故",
399
- "不由",
400
- "不在",
401
- "下水",
402
- "下巴",
403
- "上头",
404
- "上司",
405
- "丈夫",
406
- "丈人",
407
- "一辈",
408
- "那个",
409
- "菩萨",
410
- "父亲",
411
- "母亲",
412
- "咕噜",
413
- "邋遢",
414
- "费用",
415
- "冤家",
416
- "甜头",
417
- "介绍",
418
- "荒唐",
419
- "大人",
420
- "泥鳅",
421
- "幸福",
422
- "熟悉",
423
- "计划",
424
- "扑腾",
425
- "蜡烛",
426
- "姥爷",
427
- "照顾",
428
- "喉咙",
429
- "吉他",
430
- "弄堂",
431
- "蚂蚱",
432
- "凤凰",
433
- "拖沓",
434
- "寒碜",
435
- "糟蹋",
436
- "倒腾",
437
- "报复",
438
- "逻辑",
439
- "盘缠",
440
- "喽啰",
441
- "牢骚",
442
- "咖喱",
443
- "扫把",
444
- "惦记",
445
- }
446
- self.must_not_neural_tone_words = {
447
- "男子",
448
- "女子",
449
- "分子",
450
- "原子",
451
- "量子",
452
- "莲子",
453
- "石子",
454
- "瓜子",
455
- "电子",
456
- "人人",
457
- "虎虎",
458
- }
459
- self.punc = ":,;。?!“”‘’':,;.?!"
460
-
461
- # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
- # e.g.
463
- # word: "家里"
464
- # pos: "s"
465
- # finals: ['ia1', 'i3']
466
- def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
467
- # reduplicated words for nouns, verbs and adjectives, e.g. 奶奶, 试试, 旺旺
468
- for j, item in enumerate(word):
469
- if (
470
- j - 1 >= 0
471
- and item == word[j - 1]
472
- and pos[0] in {"n", "v", "a"}
473
- and word not in self.must_not_neural_tone_words
474
- ):
475
- finals[j] = finals[j][:-1] + "5"
476
- ge_idx = word.find("个")
477
- if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
478
- finals[-1] = finals[-1][:-1] + "5"
479
- elif len(word) >= 1 and word[-1] in "的地得":
480
- finals[-1] = finals[-1][:-1] + "5"
481
- # e.g. 走了, 看着, 去过
482
- # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
483
- # finals[-1] = finals[-1][:-1] + "5"
484
- elif (
485
- len(word) > 1
486
- and word[-1] in "们子"
487
- and pos in {"r", "n"}
488
- and word not in self.must_not_neural_tone_words
489
- ):
490
- finals[-1] = finals[-1][:-1] + "5"
491
- # e.g. 桌上, 地下, 家里
492
- elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
493
- finals[-1] = finals[-1][:-1] + "5"
494
- # e.g. 上来, 下去
495
- elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
496
- finals[-1] = finals[-1][:-1] + "5"
497
- # "个" used as a measure word (classifier)
498
- elif (
499
- ge_idx >= 1
500
- and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
501
- ) or word == "个":
502
- finals[ge_idx] = finals[ge_idx][:-1] + "5"
503
- else:
504
- if (
505
- word in self.must_neural_tone_words
506
- or word[-2:] in self.must_neural_tone_words
507
- ):
508
- finals[-1] = finals[-1][:-1] + "5"
509
-
510
- word_list = self._split_word(word)
511
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
512
- for i, word in enumerate(word_list):
513
- # words that conventionally take the neutral tone in Chinese
514
- if (
515
- word in self.must_neural_tone_words
516
- or word[-2:] in self.must_neural_tone_words
517
- ):
518
- finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
519
- finals = sum(finals_list, [])
520
- return finals
521
-
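A minimal, illustrative check of the neutral-tone rule above, assuming the enclosing ToneSandhi class is importable and pypinyin is available; the exact finals shown are typical pypinyin output and may differ:

from pypinyin import lazy_pinyin, Style

sandhi = ToneSandhi()
# "朋友" is listed in must_neural_tone_words, and jieba typically tags it "n"
finals = lazy_pinyin("朋友", neutral_tone_with_five=True, style=Style.FINALS_TONE3)
print(sandhi._neural_sandhi("朋友", "n", finals))  # e.g. ['eng2', 'ou3'] -> ['eng2', 'ou5']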
522
- def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
- # e.g. 看不懂
524
- if len(word) == 3 and word[1] == "不":
525
- finals[1] = finals[1][:-1] + "5"
526
- else:
527
- for i, char in enumerate(word):
528
- # "不" before tone4 should be bu2, e.g. 不怕
529
- if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
- finals[i] = finals[i][:-1] + "2"
531
- return finals
532
-
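An illustrative sketch of the two 不 rules above; the hard-coded finals are hypothetical stand-ins for pypinyin output:

sandhi = ToneSandhi()
print(sandhi._bu_sandhi("不怕", ["u4", "a4"]))             # -> ['u2', 'a4']   (不 before tone 4 becomes tone 2)
print(sandhi._bu_sandhi("看不懂", ["an4", "u4", "ong3"]))   # -> ['an4', 'u5', 'ong3']  (不 in the middle of a three-character word becomes neutral)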
533
- def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
- # "一" in number sequences, e.g. 一零零, 二一零
535
- if word.find("一") != -1 and all(
536
- [item.isnumeric() for item in word if item != "一"]
537
- ):
538
- return finals
539
- # "一" between reduplication words should be yi5, e.g. 看一看
540
- elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
- finals[1] = finals[1][:-1] + "5"
542
- # when "一" is ordinal word, it should be yi1
543
- elif word.startswith("第一"):
544
- finals[1] = finals[1][:-1] + "1"
545
- else:
546
- for i, char in enumerate(word):
547
- if char == "一" and i + 1 < len(word):
548
- # "一" before tone4 should be yi2, e.g. 一段
549
- if finals[i + 1][-1] == "4":
550
- finals[i] = finals[i][:-1] + "2"
551
- # "一" before non-tone4 should be yi4, e.g. 一天
552
- else:
553
- # if "一" is followed by punctuation, it keeps the first tone
554
- if word[i + 1] not in self.punc:
555
- finals[i] = finals[i][:-1] + "4"
556
- return finals
557
-
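Likewise for the 一 rules, again with hypothetical finals standing in for pypinyin output:

sandhi = ToneSandhi()
print(sandhi._yi_sandhi("一段", ["i1", "uan4"]))           # -> ['i2', 'uan4']   (一 before tone 4 -> yi2)
print(sandhi._yi_sandhi("一天", ["i1", "ian1"]))           # -> ['i4', 'ian1']   (一 before tones 1/2/3 -> yi4)
print(sandhi._yi_sandhi("看一看", ["an4", "i1", "an4"]))    # -> ['an4', 'i5', 'an4']  (一 between reduplicated verbs -> neutral)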
558
- def _split_word(self, word: str) -> List[str]:
559
- word_list = jieba.cut_for_search(word)
560
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
561
- first_subword = word_list[0]
562
- first_begin_idx = word.find(first_subword)
563
- if first_begin_idx == 0:
564
- second_subword = word[len(first_subword) :]
565
- new_word_list = [first_subword, second_subword]
566
- else:
567
- second_subword = word[: -len(first_subword)]
568
- new_word_list = [second_subword, first_subword]
569
- return new_word_list
570
-
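_split_word delegates to jieba.cut_for_search, so its output depends on jieba's dictionary; a typical, hypothetical result for a 2+1 compound:

sandhi = ToneSandhi()
print(sandhi._split_word("蒙古包"))  # e.g. ['蒙古', '包']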
571
- def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
572
- if len(word) == 2 and self._all_tone_three(finals):
573
- finals[0] = finals[0][:-1] + "2"
574
- elif len(word) == 3:
575
- word_list = self._split_word(word)
576
- if self._all_tone_three(finals):
577
- # disyllabic + monosyllabic, e.g. 蒙古/包
578
- if len(word_list[0]) == 2:
579
- finals[0] = finals[0][:-1] + "2"
580
- finals[1] = finals[1][:-1] + "2"
581
- # monosyllabic + disyllabic, e.g. 纸/老虎
582
- elif len(word_list[0]) == 1:
583
- finals[1] = finals[1][:-1] + "2"
584
- else:
585
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
586
- if len(finals_list) == 2:
587
- for i, sub in enumerate(finals_list):
588
- # e.g. 所有/人
589
- if self._all_tone_three(sub) and len(sub) == 2:
590
- finals_list[i][0] = finals_list[i][0][:-1] + "2"
591
- # e.g. 好/喜欢
592
- elif (
593
- i == 1
594
- and not self._all_tone_three(sub)
595
- and finals_list[i][0][-1] == "3"
596
- and finals_list[0][-1][-1] == "3"
597
- ):
598
- finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
599
- finals = sum(finals_list, [])
600
- # split a four-character idiom into two two-character words
601
- elif len(word) == 4:
602
- finals_list = [finals[:2], finals[2:]]
603
- finals = []
604
- for sub in finals_list:
605
- if self._all_tone_three(sub):
606
- sub[0] = sub[0][:-1] + "2"
607
- finals += sub
608
-
609
- return finals
610
-
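A small illustrative case of the two-syllable branch above (both syllables are third tone, so the first is raised to tone 2); the finals are hypothetical pypinyin-style strings:

sandhi = ToneSandhi()
print(sandhi._three_sandhi("老虎", ["ao3", "u3"]))  # -> ['ao2', 'u3']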
611
- def _all_tone_three(self, finals: List[str]) -> bool:
612
- return all(x[-1] == "3" for x in finals)
613
-
614
- # merge "不" with the word that follows it
615
- # if not merged, "不" sometimes appears alone in jieba's output, which can cause sandhi errors
616
- def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
- new_seg = []
618
- last_word = ""
619
- for word, pos in seg:
620
- if last_word == "不":
621
- word = last_word + word
622
- if word != "不":
623
- new_seg.append((word, pos))
624
- last_word = word[:]
625
- if last_word == "不":
626
- new_seg.append((last_word, "d"))
627
- last_word = ""
628
- return new_seg
629
-
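An illustrative re-merge of a segmentation in which jieba emitted 不 as its own token; the POS tags are hypothetical:

sandhi = ToneSandhi()
print(sandhi._merge_bu([("不", "d"), ("怕", "v")]))  # -> [('不怕', 'v')]  (不 re-attached to the following word)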
630
- # function 1: merge "一" with the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
631
- # function 2: merge a lone "一" with the word that follows it
632
- # if not merged, "一" sometimes appears alone in jieba's output, which can cause sandhi errors
633
- # e.g.
634
- # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
- # output seg: [['听一听', 'v']]
636
- def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
- new_seg = []
638
- # function 1
639
- i = 0
640
- while i < len(seg):
641
- word, pos = seg[i]
642
- if (
643
- i - 1 >= 0
644
- and word == "一"
645
- and i + 1 < len(seg)
646
- and seg[i - 1][0] == seg[i + 1][0]
647
- and seg[i - 1][1] == "v"
648
- ):
649
- new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
650
- i += 2
651
- else:
652
- if (
653
- i - 2 >= 0
654
- and seg[i - 1][0] == "一"
655
- and seg[i - 2][0] == word
656
- and pos == "v"
657
- ):
658
- # skip this duplicated reduplication word, but advance the index so the while loop cannot stall
- i += 1
- continue
659
- else:
660
- new_seg.append([word, pos])
661
- i += 1
662
- seg = [i for i in new_seg if len(i) > 0]
663
- new_seg = []
664
- # function 2
665
- for i, (word, pos) in enumerate(seg):
666
- if new_seg and new_seg[-1][0] == "一":
667
- new_seg[-1][0] = new_seg[-1][0] + word
668
- else:
669
- new_seg.append([word, pos])
670
- return new_seg
671
-
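Both behaviours documented above, checked on small hypothetical segmentations:

sandhi = ToneSandhi()
# function 1: rebuild the V-一-V pattern into a single token
print(sandhi._merge_yi([("听", "v"), ("一", "m"), ("听", "v")]))  # -> [['听一听', 'v']]
# function 2: attach a lone 一 to the word that follows it
print(sandhi._merge_yi([("一", "m"), ("个", "q")]))               # -> [['一个', 'm']]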
672
- # merge two adjacent words when both consist entirely of third-tone syllables
673
- def _merge_continuous_three_tones(
674
- self, seg: List[Tuple[str, str]]
675
- ) -> List[Tuple[str, str]]:
676
- new_seg = []
677
- sub_finals_list = [
678
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
679
- for (word, pos) in seg
680
- ]
681
- assert len(sub_finals_list) == len(seg)
682
- merge_last = [False] * len(seg)
683
- for i, (word, pos) in enumerate(seg):
684
- if (
685
- i - 1 >= 0
686
- and self._all_tone_three(sub_finals_list[i - 1])
687
- and self._all_tone_three(sub_finals_list[i])
688
- and not merge_last[i - 1]
689
- ):
690
- # if the previous word is a reduplication, do not merge, since reduplications must go through _neural_sandhi
691
- if (
692
- not self._is_reduplication(seg[i - 1][0])
693
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
694
- ):
695
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
696
- merge_last[i] = True
697
- else:
698
- new_seg.append([word, pos])
699
- else:
700
- new_seg.append([word, pos])
701
-
702
- return new_seg
703
-
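An illustrative merge in which both words are all third tone and the result stays within three characters (the finals come from pypinyin internally):

sandhi = ToneSandhi()
print(sandhi._merge_continuous_three_tones([("纸", "n"), ("老虎", "n")]))  # -> [['纸老虎', 'n']]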
704
- def _is_reduplication(self, word: str) -> bool:
705
- return len(word) == 2 and word[0] == word[1]
706
-
707
- # merge when the last syllable of the first word and the first syllable of the second word are both third tone
708
- def _merge_continuous_three_tones_2(
709
- self, seg: List[Tuple[str, str]]
710
- ) -> List[Tuple[str, str]]:
711
- new_seg = []
712
- sub_finals_list = [
713
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
714
- for (word, pos) in seg
715
- ]
716
- assert len(sub_finals_list) == len(seg)
717
- merge_last = [False] * len(seg)
718
- for i, (word, pos) in enumerate(seg):
719
- if (
720
- i - 1 >= 0
721
- and sub_finals_list[i - 1][-1][-1] == "3"
722
- and sub_finals_list[i][0][-1] == "3"
723
- and not merge_last[i - 1]
724
- ):
725
- # if the previous word is a reduplication, do not merge, since reduplications must go through _neural_sandhi
726
- if (
727
- not self._is_reduplication(seg[i - 1][0])
728
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
729
- ):
730
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
731
- merge_last[i] = True
732
- else:
733
- new_seg.append([word, pos])
734
- else:
735
- new_seg.append([word, pos])
736
- return new_seg
737
-
738
- def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
739
- new_seg = []
740
- for i, (word, pos) in enumerate(seg):
741
- if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
742
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
743
- else:
744
- new_seg.append([word, pos])
745
- return new_seg
746
-
747
- def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
748
- new_seg = []
749
- for i, (word, pos) in enumerate(seg):
750
- if new_seg and word == new_seg[-1][0]:
751
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
752
- else:
753
- new_seg.append([word, pos])
754
- return new_seg
755
-
756
- def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
757
- seg = self._merge_bu(seg)
758
- try:
759
- seg = self._merge_yi(seg)
760
- except Exception:
761
- print("_merge_yi failed")
762
- seg = self._merge_reduplication(seg)
763
- seg = self._merge_continuous_three_tones(seg)
764
- seg = self._merge_continuous_three_tones_2(seg)
765
- seg = self._merge_er(seg)
766
- return seg
767
-
768
- def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
769
- finals = self._bu_sandhi(word, finals)
770
- finals = self._yi_sandhi(word, finals)
771
- finals = self._neural_sandhi(word, pos, finals)
772
- finals = self._three_sandhi(word, finals)
773
- return finals
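Putting the pieces together: a minimal, hypothetical end-to-end sketch of how a Chinese g2p front end would typically drive this class, assuming jieba and pypinyin are installed (segment, pre-merge the segmentation, then apply all sandhi rules word by word):

import jieba.posseg as psg
from pypinyin import lazy_pinyin, Style

sandhi = ToneSandhi()
text = "我买了一个苹果"
# use .word/.flag explicitly so this does not rely on unpacking jieba's pair objects
seg = [(p.word, p.flag) for p in psg.lcut(text)]
seg = sandhi.pre_merge_for_modify(seg)
for word, pos in seg:
    finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    finals = sandhi.modified_tone(word, pos, finals)
    print(word, finals)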