OttoYu committed on
Commit
30cac2e
·
verified ·
1 Parent(s): 3c7275e
Files changed (3) hide show
  1. app.py +128 -0
  2. jyutping_results_largec.json +0 -0
  3. lexi-can_key.json +0 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from functools import lru_cache
3
+ import gradio as gr
4
+
5
@lru_cache(maxsize=1)
def load_lexi_can_key(json_file):
    """Read and parse the Jyutping lexicon stored as JSON at ``json_file``.

    The file is decoded as UTF-8. ``lru_cache(maxsize=1)`` keeps the parsed
    object for the most recently requested path, so repeated calls with the
    same path return the very same object without touching the disk again.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
9
+
10
@lru_cache(maxsize=1)
def load_saved_results(json_file):
    """Read and parse the precomputed match records stored at ``json_file``.

    The file is decoded as UTF-8 and parsed as JSON. The result for the most
    recently requested path is memoised by ``lru_cache(maxsize=1)``, so the
    same parsed object is handed back on subsequent calls with that path.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
14
+
15
def preprocess_jyutping_data(jyutping_data):
    """Invert the lexicon into a ``character -> syllable`` lookup table.

    ``jyutping_data`` maps each syllable to a list of entries, and every
    entry's ``"漢字"`` value is iterated character by character. When a
    character is listed under several syllables, the syllable encountered
    first (in the mapping's iteration order) is the one that is kept.
    """
    lookup = {}
    for syllable in jyutping_data:
        for entry in jyutping_data[syllable]:
            for glyph in entry["漢字"]:
                # Keep the first syllable seen for a glyph; never overwrite.
                if glyph not in lookup:
                    lookup[glyph] = syllable
    return lookup
22
+
23
def chinese_batch_to_jyutping(text_batch, char_to_jyutping):
    """Convert each text in ``text_batch`` to its distinct Jyutping syllables.

    Args:
        text_batch: Iterable of strings to convert.
        char_to_jyutping: Mapping from a single character to its syllable.

    Returns:
        A list with one dict per input text:
        ``"chinese"`` holds the text's characters in order, and
        ``"jyutping"`` holds the distinct syllables in order of first
        appearance. Characters missing from ``char_to_jyutping`` are passed
        through unchanged.
    """
    results = []
    for text in text_batch:
        syllables = (char_to_jyutping.get(char, char) for char in text)
        results.append({
            "chinese": list(text),
            # dict.fromkeys de-duplicates while keeping first-seen order.
            # list(set(...)) would make the order depend on string hash
            # randomisation and therefore differ between interpreter runs.
            "jyutping": list(dict.fromkeys(syllables)),
        })
    return results
33
+
34
def get_similar_initials():
    """Return a fresh table mapping an initial to the initials treated as similar.

    Keys are one- or two-letter initials; each value is the list of other
    initials accepted as a near match for that key. A new dict with new
    lists is built on every call, so callers may mutate the result freely.
    """
    table = (
        ('b', ('d', 'p')),
        ('c', ('s',)),
        ('d', ('b', 't')),
        ('f', ('h',)),
        ('g', ('gw',)),
        ('gw', ('g',)),
        ('h', ('f',)),
        ('j', ('z',)),
        ('jw', ('w',)),
        ('l', ('n',)),
        ('n', ('l',)),
        ('ng', ('n',)),
        ('p', ('b',)),
        ('s', ('c',)),
        ('t', ('d',)),
        ('w', ('jw',)),
        ('z', ('j',)),
    )
    return {initial: list(neighbours) for initial, neighbours in table}
54
+
55
def are_jyutping_similar(jyutping1, jyutping2, similar_initials):
    """Tell whether two syllables start with the same or a similar initial.

    The initial of a syllable is its first two characters when that pair is
    a key of ``similar_initials``, otherwise its first character.

    Args:
        jyutping1: Reference syllable; its initial is the lookup key.
        jyutping2: Candidate syllable.
        similar_initials: Mapping from an initial to the initials accepted
            as similar to it.

    Returns:
        ``True`` when both initials are equal, or when the initial of
        ``jyutping2`` is listed under the initial of ``jyutping1``. The lookup
        is directional: only the entry for ``jyutping1``'s initial is
        consulted. ``False`` when either syllable is empty.
    """
    # An empty syllable has no initial; indexing [0] would raise IndexError.
    if not jyutping1 or not jyutping2:
        return False

    def initial_of(syllable):
        digraph = syllable[:2]
        return digraph if digraph in similar_initials else syllable[0]

    initial1 = initial_of(jyutping1)
    initial2 = initial_of(jyutping2)
    return initial1 == initial2 or initial2 in similar_initials.get(initial1, ())
59
+
60
@lru_cache(maxsize=1)
def _char_to_jyutping_for(json_file):
    """Build, once per lexicon path, the character -> syllable lookup table."""
    return preprocess_jyutping_data(load_lexi_can_key(json_file))


def match_user_input(user_input):
    """Match ``user_input`` against the saved records by Jyutping.

    The text is converted character by character to syllables. The first
    saved record whose ``"jyutping"`` contains every input syllable is
    reported under ``"match"``. Failing that, the record with the highest
    count of similar-initial syllable pairs is reported under
    ``"closest_match"``.

    Args:
        user_input: Text to look up. ``None``, empty and whitespace-only
            values are treated as "nothing to match".

    Returns:
        A JSON-formatted string (non-ASCII kept verbatim, 4-space indent)
        holding either the match, the closest match, or a ``"message"``
        explaining that nothing suitable was found.
    """
    no_match_response = json.dumps({
        "message": "No suitable match found. Please check the input or try different text."
    }, ensure_ascii=False, indent=4)

    # Without this guard an empty input yields an empty syllable set, which
    # is a subset of everything, so the first saved record would be returned
    # as an "exact" match. A None input would raise TypeError instead.
    text = (user_input or "").strip()
    if not text:
        return no_match_response

    similar_initials = get_similar_initials()
    saved_results = load_saved_results('jyutping_results_largec.json')

    # The inverted lexicon only depends on the file, so build it once
    # instead of on every request.
    char_to_jyutping = _char_to_jyutping_for('lexi-can_key.json')
    jyutping_results = chinese_batch_to_jyutping([text], char_to_jyutping)

    user_jyutping = jyutping_results[0]["jyutping"]
    input_text = jyutping_results[0]["chinese"]

    # Hoisted: the set of input syllables is the same for every candidate.
    wanted = set(user_jyutping)
    exact_match = next(
        (result for result in saved_results if wanted.issubset(result["jyutping"])),
        None,
    )

    if exact_match:
        return json.dumps({
            "input_text": input_text,
            "input_jyutping": user_jyutping,
            "match": exact_match
        }, ensure_ascii=False, indent=4)

    closest_match = None
    highest_similarity_score = 0

    for result in saved_results:
        # Score = number of (candidate syllable, input syllable) pairs whose
        # initials are equal or listed as similar.
        score = sum(
            1
            for jyutping in result["jyutping"]
            for uj in user_jyutping
            if are_jyutping_similar(uj, jyutping, similar_initials)
        )
        # Strict comparison keeps the earliest record on ties and requires
        # at least one similar pair.
        if score > highest_similarity_score:
            highest_similarity_score = score
            closest_match = result

    if closest_match:
        return json.dumps({
            "input_text": input_text,
            "input_jyutping": user_jyutping,
            "closest_match": closest_match
        }, ensure_ascii=False, indent=4)

    return no_match_response
100
+
101
# Example inputs offered in the UI dropdown.
# NOTE(review): these look like deliberately mis-written (homophone) forms of
# real place names, used to exercise the phonetic matcher — confirm.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]
112
+
113
def gradio_app(sample_case, custom_input):
    """Callback for the UI: look up whichever of the two inputs is provided.

    ``sample_case`` takes precedence whenever it is truthy; otherwise
    ``custom_input`` is used as-is. The chosen value is handed to
    ``match_user_input`` and its result is returned unchanged.
    """
    if sample_case:
        return match_user_input(sample_case)
    return match_user_input(custom_input)
116
+
117
# Gradio passes component values to ``fn`` positionally, so the order of
# ``inputs`` must mirror ``gradio_app(sample_case, custom_input)``: dropdown
# first, free-text box second. Listing the textbox first would route typed
# text into ``sample_case`` and the dropdown value into ``custom_input``.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        # ``None`` is offered as a choice so the user can clear the sample
        # selection and fall back to the free-text box.
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case"),
        gr.Textbox(placeholder="Or enter text", label="Placename/Street/Building name"),
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description=(
        "Select a sample case or enter Cantonese text, and the app will return a match or the closest match based on phonetic similarity. "
        "選擇一個範例案例或輸入粵語文本,應用程式將傳回粵拼匹配或基於語音相似性的最接近匹配。"
    ),
)

interface.launch()
jyutping_results_largec.json ADDED
The diff for this file is too large to render. See raw diff
 
lexi-can_key.json ADDED
The diff for this file is too large to render. See raw diff