Spaces:
Runtime error
Runtime error
Trail
Browse files- app.py +128 -0
- jyutping_results_largec.json +0 -0
- lexi-can_key.json +0 -0
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from functools import lru_cache
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
@lru_cache(maxsize=1)
def load_lexi_can_key(json_file):
    """Load the lexi-can syllable→characters key table from *json_file*.

    The result is memoized (maxsize=1), so the file is parsed at most once
    per distinct path for the life of the process.
    """
    with open(json_file, encoding='utf-8') as fh:
        data = json.load(fh)
    return data
|
9 |
+
|
10 |
+
@lru_cache(maxsize=1)
def load_saved_results(json_file):
    """Load the precomputed jyutping match results from *json_file*.

    Memoized with maxsize=1 so the (potentially large) results file is read
    and parsed only once per path.
    """
    with open(json_file, encoding='utf-8') as handle:
        return json.load(handle)
|
14 |
+
|
15 |
+
def preprocess_jyutping_data(jyutping_data):
    """Invert the syllable→entries table into a character→syllable lookup.

    Each entry's "漢字" string contributes one mapping per character.  The
    FIRST syllable seen for a character wins; later occurrences are ignored,
    so the iteration order of *jyutping_data* decides ties.
    """
    lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for ch in entry["漢字"]:
                if ch not in lookup:
                    lookup[ch] = syllable
    return lookup
|
22 |
+
|
23 |
+
def chinese_batch_to_jyutping(text_batch, char_to_jyutping):
    """Convert each text in *text_batch* into its jyutping syllables.

    Characters missing from *char_to_jyutping* pass through unchanged.
    Returns one dict per input text with:
      - "chinese":  the text split into a list of characters
      - "jyutping": the looked-up syllables, deduplicated

    Bug fix: the original deduplicated with ``list(set(...))``, whose order
    is hash-randomized and therefore differs between runs; ``dict.fromkeys``
    deduplicates while preserving first-seen order, making output stable.
    """
    results = []
    for text in text_batch:
        syllables = [char_to_jyutping.get(ch, ch) for ch in text]
        results.append({
            "chinese": list(text),
            # order-preserving dedup (stable, unlike list(set(...)))
            "jyutping": list(dict.fromkeys(syllables)),
        })
    return results
|
33 |
+
|
34 |
+
def get_similar_initials():
    """Return the table of jyutping initials mapped to confusable initials.

    Keys are initials (including the digraphs 'gw', 'jw', 'ng'); values list
    the initials treated as phonetically similar for fuzzy matching.
    """
    similar = {
        'b': ['d', 'p'],
        'c': ['s'],
        'd': ['b', 't'],
        'f': ['h'],
        'g': ['gw'],
        'gw': ['g'],
        'h': ['f'],
        'j': ['z'],
        'jw': ['w'],
        'l': ['n'],
        'n': ['l'],
        'ng': ['n'],
        'p': ['b'],
        's': ['c'],
        't': ['d'],
        'w': ['jw'],
        'z': ['j'],
    }
    return similar
|
54 |
+
|
55 |
+
def are_jyutping_similar(jyutping1, jyutping2, similar_initials):
    """Return True if the two syllables have identical or confusable initials.

    The initial is the two-character prefix when that prefix is a key of
    *similar_initials* (a digraph such as 'gw' or 'ng'), otherwise the first
    character.  Similarity is directional: it holds when the initials are
    equal or when initial2 is listed among initial1's confusables.

    Bug fix: the original indexed ``jyutping[0]``, which raises IndexError
    for an empty syllable; ``[:1]`` yields '' instead, so empty input is
    handled gracefully (two empty syllables compare as similar).
    """
    prefix1, prefix2 = jyutping1[:2], jyutping2[:2]
    initial1 = prefix1 if prefix1 in similar_initials else jyutping1[:1]
    initial2 = prefix2 if prefix2 in similar_initials else jyutping2[:1]
    return initial1 == initial2 or initial2 in similar_initials.get(initial1, [])
|
59 |
+
|
60 |
+
def match_user_input(user_input):
    """Match *user_input* (Cantonese text) against the saved jyutping results.

    Pipeline: convert the text to jyutping syllables, then
      1. return the first saved entry whose "jyutping" list covers every
         input syllable (exact match), else
      2. return the saved entry with the highest count of phonetically
         similar syllable pairs (closest match), else
      3. return a "no match" message.

    Always returns a pretty-printed JSON string (non-ASCII preserved).
    """
    similar_initials = get_similar_initials()
    saved_results = load_saved_results('jyutping_results_largec.json')
    char_map = preprocess_jyutping_data(load_lexi_can_key('lexi-can_key.json'))

    converted = chinese_batch_to_jyutping([user_input], char_map)[0]
    user_jyutping = converted["jyutping"]
    input_text = converted["chinese"]

    def to_json(payload):
        # All three outcomes share the same serialization settings.
        return json.dumps(payload, ensure_ascii=False, indent=4)

    # 1. Exact match: every input syllable appears in the saved entry.
    user_set = set(user_jyutping)
    for candidate in saved_results:
        if user_set.issubset(candidate["jyutping"]):
            return to_json({
                "input_text": input_text,
                "input_jyutping": user_jyutping,
                "match": candidate,
            })

    # 2. Closest match: count similar (saved, input) syllable pairs.
    best = None
    best_score = 0
    for candidate in saved_results:
        score = sum(
            1
            for saved_syllable in candidate["jyutping"]
            for user_syllable in user_jyutping
            if are_jyutping_similar(user_syllable, saved_syllable, similar_initials)
        )
        if score > best_score:
            best_score = score
            best = candidate

    if best:
        return to_json({
            "input_text": input_text,
            "input_jyutping": user_jyutping,
            "closest_match": best,
        })

    # 3. Nothing scored above zero.
    return to_json({
        "message": "No suitable match found. Please check the input or try different text."
    })
|
100 |
+
|
101 |
+
# Sample inputs for the Gradio dropdown — presumably Hong Kong street/building
# names containing homophone substitutions to exercise the phonetic matcher
# (TODO confirm the intended canonical forms against the saved results file).
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里"
]
|
112 |
+
|
113 |
+
def gradio_app(sample_case, custom_input):
    """Gradio entry point: match whichever input is non-empty, first one wins.

    NOTE(review): gr.Interface passes widget values to this function in the
    order they appear in `inputs`; check below which widget actually feeds
    each parameter — the names here may not match the widget labels.
    """
    chosen = sample_case or custom_input
    return match_user_input(chosen)
|
116 |
+
|
117 |
+
# Gradio UI wiring.  NOTE(review): the Textbox is listed FIRST and the
# Dropdown SECOND, so the typed text is delivered to gradio_app's
# `sample_case` parameter and the dropdown choice to `custom_input` —
# the parameter names appear swapped relative to the widget labels, and
# typed text therefore takes priority over a dropdown selection; confirm
# this priority is intended.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Or enter text", label="Placename/Street/Building name"),
        # [None] is prepended so the dropdown can be left unselected.
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Select a sample case or enter Cantonese text, and the app will return a match or the closest match based on phonetic similarity. 選擇一個範例案例或輸入粵語文本,應用程式將傳回粵拼匹配或基於語音相似性的最接近匹配。"
)

# Start the Gradio server (blocking call).
interface.launch()
|
jyutping_results_largec.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lexi-can_key.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|