# coding: utf8
import json
import re

import numpy as np
import textdistance
from tqdm import tqdm

from normalize import chuan_hoa_dau_tu_tieng_viet

# Load the list of common Vietnamese syllables and normalize the tone-mark
# placement of each one, so all lookups below use a consistent spelling.
with open("common-vietnamese-syllables.txt", "r", encoding="utf-8") as file:
    vi_syllables = [line.strip("\n") for line in file.readlines()]

vi_syllables_new = [chuan_hoa_dau_tu_tieng_viet(syllable) for syllable in vi_syllables]

# Alternation patterns for single vowels (nguyen am don), vowel pairs (doi)
# and vowel triples (ba). The longer clusters must always be tried first.
regex_nguyen_am_don = "ộ|ặ|ằ|ụ|ầ|a|ũ|á|ể|ỡ|ủ|y|ở|ế|ẵ|ệ|é|ẹ|â|ề|ê|ọ|ờ|ẳ|ợ|ỷ|ữ|ị|e|u|ò|ẫ|i|ỉ|ẩ|ẽ|õ|ỹ|ô|ỵ|ồ|ú|í|ó|ỗ|ã|ẻ|ù|ă|ơ|ứ|ậ|ử|ừ|à|ĩ|ả|ố|ớ|ự|ắ|o|ý|ỳ|ư|ấ|ễ|ạ|ỏ|ổ|è|ì"
regex_nguyen_am_doi = "uằ|iê|ấu|ượ|ùy|ạy|uỹ|ươ|ỗi|yệ|ụy|ẫy|oà|ái|ói|uồ|uỷ|oỏ|ệu|ue|oi|ậu|oè|uã|ãi|òi|ơi|ựa|ụi|iể|oá|ìa|ĩu|uẹ|ìu|ầu|ỏe|ối|uẳ|ịa|òe|ai|ọe|yể|ày|ỉu|uỵ|uể|óe|ỉa|ũa|ườ|uè|êu|ẹo|uá|ỏi|uấ|ưỡ|ội|au|iề|ửu|ọi|ảu|uẽ|ầy|ẻo|ao|yế|uẻ|uơ|ưở|iế|uở|ịu|ủa|ẫu|uặ|oằ|oò|ạu|uỳ|ạo|oọ|ưa|oẹ|ui|uậ|ủi|áo|óa|ẩu|ảy|oẵ|áu|ựu|uô|ửa|ễu|uâ|oạ|uổ|uê|ùi|ếu|ời|iu|uo|oé|yễ|oẳ|uớ|ay|iễ|ủy|ướ|oó|eo|ũi|oả|ua|ỏa|ấy|uố|èo|oo|úy|ẩy|ồi|yề|ẽo|uẫ|ứu|ãy|ổi|ía|ảo|ué|uờ|ùa|ia|ều|oa|iệ|àu|õa|oắ|uắ|uả|ứa|ởi|ụa|ũy|òa|íu|éo|oã|uă|uộ|ữu|úa|ải|ỡi|ừu|ểu|oe|õi|ọa|ừa|uệ|uý|uó|ào|uà|ây|oă|uạ|ữa|oặ|uy|ợi|uẩ|uỗ|ão|uế|ưu|ửi|ại|âu|ới|uầ|ĩa|úi|oẻ|ôi|ài|uề|yê|ậy|áy"
regex_nguyen_am_ba = "uỷu|uây|ươu|iệu|yếu|yểu|uyế|uyệ|uyề|ưỡi|uôi|ượi|uổi|oay|uào|iễu|oeo|oèo|uỗi|oai|uấy|oái|uỵu|uyể|uồi|oáy|yều|oẹo|uẫy|ưởi|iểu|uầy|iêu|uối|uyễ|ưới|iều|oài|uao|ươi|yêu|ười|uya|oải|ướu|uội|oại|iếu|ượu|uẩy|uyê|uậy"

# Every legal initial consonant (phu am dau) and final consonant (phu am cuoi);
# the empty string stands for "no consonant at this position".
all_phu_am_dau = {'', 'gh', 'q', 'kh', 'p', 'm', 'qu', 'n', 'b', 'g', 't', 'ch', 'th', 'k', 'đ', 'r', 'ph', 'ngh', 'gi', 'tr', 's', 'l', 'h', 'nh', 'c', 'ng', 'd', 'v', 'x'}
all_phu_am_cuoi = {'', 'ng', 'nh', 't', 'ch', 'c', 'p', 'm', 'k', 'n'}

# The candidate vowel clusters are exactly the alternatives of the patterns above.
all_nguyen_am_don = regex_nguyen_am_don.split("|")
all_nguyen_am_doi = regex_nguyen_am_doi.split("|")
all_nguyen_am_ba = regex_nguyen_am_ba.split("|")
all_nguyen_am = all_nguyen_am_don + all_nguyen_am_doi + all_nguyen_am_ba

confusion_set = dict()
special_list = set()
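# Illustrative sanity check (added for clarity, not part of the original build):
# the tri-vowel pattern has to be consulted before the di- and mono-vowel ones,
# otherwise e.g. "uyế" in "khuyến" would be truncated to a shorter match. The
# sample syllables assume the same NFC-composed code points as the tables above.
assert re.findall(regex_nguyen_am_ba, "khuyến")[0] == "uyế"
assert re.findall(regex_nguyen_am_doi, "tiền")[0] == "iề"
assert re.findall(regex_nguyen_am_don, "bán")[0] == "á"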
# Build the confusion set: split every syllable into initial consonant + vowel
# cluster + final consonant, then collect every real syllable reachable by
# swapping exactly one of those three parts. Syllables starting with "qu" or
# "gi" are set aside, because their "u"/"i" belongs to the onset rather than
# the vowel cluster.
for syllable in tqdm(vi_syllables_new):
    if syllable[0:2] in ["qu", "gi"]:
        special_list.add(syllable)
        continue
    confusion_set[syllable] = dict()
    syllable_candidates = confusion_set[syllable]
    syllable_candidates['phu_am_dau'] = set()
    syllable_candidates['nguyen_am'] = set()
    syllable_candidates['phu_am_cuoi'] = set()

    # Extract the vowel cluster, longest first.
    result = re.findall(regex_nguyen_am_ba, syllable)
    if not result:
        result = re.findall(regex_nguyen_am_doi, syllable)
    if not result:
        result = re.findall(regex_nguyen_am_don, syllable)
    if not result:
        raise ValueError(f"No vowel found in syllable: {syllable}")
    nguyen_am = result[0]

    # Whatever precedes / follows the vowel cluster is the initial / final consonant.
    phu_am_dau, phu_am_cuoi = "", ""
    result = re.findall(f"(.+){nguyen_am}", syllable)
    if result:
        phu_am_dau = result[0]
    result = re.findall(f"{nguyen_am}(.+)", syllable)
    if result:
        phu_am_cuoi = result[0]

    ### Error type: swap the initial consonant
    for candidate in all_phu_am_dau:
        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))

    ### Error type: swap the vowel cluster
    for candidate in all_nguyen_am:
        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))

    ### Error type: swap the final consonant
    for candidate in all_phu_am_cuoi:
        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))

# Now handle the "qu"/"gi" syllables: peel off the onset first, then decompose
# the remainder the same way. If the whole syllable matches more than one
# single-vowel character, the "u"/"i" after "q"/"g" is part of the onset
# ("qu"/"gi"); otherwise it is the nucleus itself (e.g. "gì").
for syllable in tqdm(special_list):
    if len(re.findall(regex_nguyen_am_don, syllable)) > 1:
        phu_am_dau = syllable[0:2]
        remained = syllable[2:]
    else:
        phu_am_dau = syllable[0]
        remained = syllable[1:]
    confusion_set[syllable] = dict()
    syllable_candidates = confusion_set[syllable]
    syllable_candidates['phu_am_dau'] = set()
    syllable_candidates['nguyen_am'] = set()
    syllable_candidates['phu_am_cuoi'] = set()

    # Extract the vowel cluster of the remainder, longest first; the remainder
    # may hold no further vowel, in which case there is nothing to vary.
    result = re.findall(regex_nguyen_am_ba, remained)
    if not result:
        result = re.findall(regex_nguyen_am_doi, remained)
    if not result:
        result = re.findall(regex_nguyen_am_don, remained)
    nguyen_am = result[0] if result else ""

    phu_am_cuoi = ""
    if nguyen_am != "":
        result = re.findall(f"{nguyen_am}(.+)", remained)
        if result:
            phu_am_cuoi = result[0]

    ### Error type: swap the initial consonant
    for candidate in all_phu_am_dau:
        if "".join([candidate, nguyen_am, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['phu_am_dau'].add("".join([candidate, nguyen_am, phu_am_cuoi]))

    ### Error type: swap the vowel cluster
    for candidate in all_nguyen_am:
        if "".join([phu_am_dau, candidate, phu_am_cuoi]) in vi_syllables_new:
            syllable_candidates['nguyen_am'].add("".join([phu_am_dau, candidate, phu_am_cuoi]))

    ### Error type: swap the final consonant
    for candidate in all_phu_am_cuoi:
        if "".join([phu_am_dau, nguyen_am, candidate]) in vi_syllables_new:
            syllable_candidates['phu_am_cuoi'].add("".join([phu_am_dau, nguyen_am, candidate]))

# A syllable must never be listed as a confusion of itself.
for key in tqdm(confusion_set.keys()):
    for key_2_level in confusion_set[key].keys():
        confusion_set[key][key_2_level].discard(key)

# Drop candidates that drift too far from the source syllable: anything whose
# normalized Damerau-Levenshtein similarity falls below 0.5.
for key in tqdm(confusion_set.keys()):
    for key_2_level in confusion_set[key].keys():
        candidates_to_remove = []
        for candidate in confusion_set[key][key_2_level]:
            similarity = textdistance.damerau_levenshtein.normalized_similarity(key, candidate)
            if similarity < 0.5:
                candidates_to_remove.append(candidate)
        for candidate in candidates_to_remove:
            confusion_set[key][key_2_level].remove(candidate)
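# A quick illustration of the 0.5 cutoff (sample values, not drawn from the
# generated sets): one edit across three characters is similar enough to keep,
# while fully disjoint strings score 0 and are dropped.
assert textdistance.damerau_levenshtein.normalized_similarity("ban", "bán") > 0.5  # 2/3
assert textdistance.damerau_levenshtein.normalized_similarity("a", "ông") < 0.5    # 0.0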
# Tone-mark siblings of each vowel. Despite the name, these model hitting the
# wrong tone key rather than physical keyboard adjacency: every vowel maps to
# the same base letter carrying each of the other tone marks.
keyboard_neighbor = {
    'a': 'áàảãạ', 'á': 'aàảãạ', 'à': 'aáảãạ', 'ả': 'aáàãạ', 'ã': 'aáàảạ', 'ạ': 'aáàảã',
    'ă': 'ắằẳẵặ', 'ắ': 'ăằẳẵặ', 'ằ': 'ăắẳẵặ', 'ẳ': 'ăắằẵặ', 'ẵ': 'ăắằẳặ', 'ặ': 'ăắằẳẵ',
    'â': 'ấầẩẫậ', 'ấ': 'âầẩẫậ', 'ầ': 'âấẩẫậ', 'ẩ': 'âấầẫậ', 'ẫ': 'âấầẩậ', 'ậ': 'âấầẩẫ',
    'e': 'èéẻẽẹ', 'é': 'eèẻẽẹ', 'è': 'eéẻẽẹ', 'ẻ': 'eéèẽẹ', 'ẽ': 'eéèẻẹ', 'ẹ': 'eéèẻẽ',
    'ê': 'ếềểễệ', 'ế': 'êềểễệ', 'ề': 'êếểễệ', 'ể': 'êếềễệ', 'ễ': 'êếềểệ', 'ệ': 'êếềểễ',
    'i': 'íìỉĩị', 'í': 'iìỉĩị', 'ì': 'iíỉĩị', 'ỉ': 'iíìĩị', 'ĩ': 'iíìỉị', 'ị': 'iíìỉĩ',
    'o': 'òóỏọõ', 'ó': 'oòỏọõ', 'ò': 'oóỏọõ', 'ỏ': 'oóòọõ', 'õ': 'oóòỏọ', 'ọ': 'oóòỏõ',
    'ô': 'ốồổỗộ', 'ố': 'ôồổỗộ', 'ồ': 'ôốổỗộ', 'ổ': 'ôốồỗộ', 'ỗ': 'ôốồổộ', 'ộ': 'ôốồổỗ',
    'ơ': 'ớờởợỡ', 'ớ': 'ơờởợỡ', 'ờ': 'ơớởợỡ', 'ở': 'ơớờợỡ', 'ợ': 'ơớờởỡ', 'ỡ': 'ơớờởợ',
    'u': 'úùủũụ', 'ú': 'uùủũụ', 'ù': 'uúủũụ', 'ủ': 'uúùũụ', 'ũ': 'uúùủụ', 'ụ': 'uúùủũ',
    'ư': 'ứừữửự', 'ứ': 'ưừữửự', 'ừ': 'ưứữửự', 'ử': 'ưứừữự', 'ữ': 'ưứừửự', 'ự': 'ưứừữử',
    'y': 'ýỳỷỵỹ', 'ý': 'yỳỷỵỹ', 'ỳ': 'yýỷỵỹ', 'ỷ': 'yýỳỵỹ', 'ỵ': 'yýỳỷỹ', 'ỹ': 'yýỳỷỵ',
}

pattern = "(" + "|".join(keyboard_neighbor.keys()) + "){1}"


def make_accent_change_candidates(text):
    """Return every variant of `text` with one marked vowel's tone swapped."""
    result = re.findall(pattern, text)
    candidates = []
    for candidate in result:
        for x in keyboard_neighbor[candidate]:
            candidates.append(text.replace(candidate, x))
    return set(candidates)


# Map from composed characters to their telex keystroke sequences.
with open("../noising_resources/typo.json", "r", encoding="utf-8") as typo_file:
    typo = json.load(typo_file)

typo_pattern = "(" + "|".join(typo.keys()) + "){1}"
accent_pattern = "(s|f|r|x|j|1|2|3|4|5){1}"


def convert_to_non_telex(text):
    """Rewrite `text` as raw telex keystrokes, moving any tone key to the end."""
    word = text
    candidates = re.findall(typo_pattern, text)
    for candidate in candidates:
        replaced = typo[candidate][0]
        if len(re.findall(accent_pattern, replaced)) != 0:
            # The telex form ends in a tone key: substitute the base letters
            # first, then move the tone key to the end of the word.
            word = re.sub(candidate, replaced[0:-1], word)
            word += replaced[-1]
        else:
            word = re.sub(candidate, replaced, word)
    return word


def keep_1_distance_candidates(text, nguyen_am_errors: set):
    """Keep only the errors whose telex keystrokes are at most one edit away."""
    nguyen_am_errors = list(nguyen_am_errors)
    text = convert_to_non_telex(text)
    distances = [textdistance.damerau_levenshtein(text, convert_to_non_telex(error))
                 for error in nguyen_am_errors]
    indices_to_keep = np.where(np.array(distances) <= 1)[0]
    return set([nguyen_am_errors[i] for i in indices_to_keep])


# Split the vowel confusions into a high-probability tier (reachable by a tone
# slip or by a single telex keystroke error) and a lower-probability tier.
for key in tqdm(confusion_set.keys()):
    candidates = make_accent_change_candidates(key)
    one_distance_candidates = keep_1_distance_candidates(key, confusion_set[key]['nguyen_am'])
    candidates = candidates.union(one_distance_candidates)
    high_probs_list = candidates.intersection(confusion_set[key]['nguyen_am'])
    lower_probs_list = confusion_set[key]['nguyen_am'].difference(high_probs_list)
    confusion_set[key]['nguyen_am'] = [high_probs_list, lower_probs_list]

# Sets are not JSON-serializable, so convert everything to lists before writing.
for key in tqdm(confusion_set.keys()):
    confusion_set[key]['nguyen_am'] = [list(confusion_set[key]['nguyen_am'][0]),
                                       list(confusion_set[key]['nguyen_am'][1])]
    confusion_set[key]['phu_am_dau'] = list(confusion_set[key]['phu_am_dau'])
    confusion_set[key]['phu_am_cuoi'] = list(confusion_set[key]['phu_am_cuoi'])

with open("../noising_resources/confusion_set.json", "w+", encoding="utf-8") as outfile:
    json.dump(confusion_set, outfile, ensure_ascii=False)
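# Illustrative checks of the two generators above (added for clarity; "bán" is
# just a sample input). make_accent_change_candidates swaps in every tone
# sibling of each marked vowel:
assert make_accent_change_candidates("bán") == {"ban", "bàn", "bản", "bãn", "bạn"}

# convert_to_non_telex depends on the contents of typo.json, which are assumed
# here: if typo["á"][0] were "as", then convert_to_non_telex("bán") would
# substitute the base letter (giving "ban") and append the tone key "s",
# producing "bans".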
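# Hypothetical consumer-side sketch (not executed here): the generated file can
# be read back with json.load, and a noising step might prefer the
# high-probability vowel tier. The key names match what this script writes; the
# sample syllable "ban" and the sampling policy are assumptions.
#
#   import json, random
#   with open("../noising_resources/confusion_set.json", encoding="utf-8") as f:
#       cs = json.load(f)
#   high, low = cs["ban"]["nguyen_am"]
#   noisy = random.choice(high or low or cs["ban"]["phu_am_dau"])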