Spaces:
Runtime error
Runtime error
Trail
Browse files- app.py +128 -0
- jyutping_results_largec.json +0 -0
- lexi-can_key.json +0 -0
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from functools import lru_cache
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
@lru_cache(maxsize=1)
def load_lexi_can_key(json_file):
    """Load the lexi-can syllable→characters key table from *json_file*.

    The result is memoized (maxsize=1), so the file is parsed at most once
    per distinct path for the life of the process.
    """
    with open(json_file, encoding='utf-8') as fh:
        data = json.load(fh)
    return data
|
9 |
+
|
10 |
+
@lru_cache(maxsize=1)
def load_saved_results(json_file):
    """Load the precomputed jyutping match results from *json_file*.

    Memoized with maxsize=1 so the (potentially large) results file is read
    and parsed only once per path.
    """
    with open(json_file, encoding='utf-8') as handle:
        return json.load(handle)
|
14 |
+
|
15 |
+
def preprocess_jyutping_data(jyutping_data):
    """Invert the syllable→entries table into a character→syllable lookup.

    Each entry's "漢字" string contributes one mapping per character.  The
    FIRST syllable seen for a character wins; later occurrences are ignored,
    so the iteration order of *jyutping_data* decides ties.
    """
    lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for ch in entry["漢字"]:
                if ch not in lookup:
                    lookup[ch] = syllable
    return lookup
|
22 |
+
|
23 |
+
def chinese_batch_to_jyutping(text_batch, char_to_jyutping):
    """Convert each text in *text_batch* into its jyutping syllables.

    Characters missing from *char_to_jyutping* pass through unchanged.
    Returns one dict per input text with:
      - "chinese":  the text split into a list of characters
      - "jyutping": the looked-up syllables, deduplicated

    Bug fix: the original deduplicated with ``list(set(...))``, whose order
    is hash-randomized and therefore differs between runs; ``dict.fromkeys``
    deduplicates while preserving first-seen order, making output stable.
    """
    results = []
    for text in text_batch:
        syllables = [char_to_jyutping.get(ch, ch) for ch in text]
        results.append({
            "chinese": list(text),
            # order-preserving dedup (stable, unlike list(set(...)))
            "jyutping": list(dict.fromkeys(syllables)),
        })
    return results
|
33 |
+
|
34 |
+
def get_similar_initials():
    """Return the table of jyutping initials mapped to confusable initials.

    Keys are initials (including the digraphs 'gw', 'jw', 'ng'); values list
    the initials treated as phonetically similar for fuzzy matching.
    """
    similar = {
        'b': ['d', 'p'],
        'c': ['s'],
        'd': ['b', 't'],
        'f': ['h'],
        'g': ['gw'],
        'gw': ['g'],
        'h': ['f'],
        'j': ['z'],
        'jw': ['w'],
        'l': ['n'],
        'n': ['l'],
        'ng': ['n'],
        'p': ['b'],
        's': ['c'],
        't': ['d'],
        'w': ['jw'],
        'z': ['j'],
    }
    return similar
|
54 |
+
|
55 |
+
def are_jyutping_similar(jyutping1, jyutping2, similar_initials):
    """Return True if the two syllables have identical or confusable initials.

    The initial is the two-character prefix when that prefix is a key of
    *similar_initials* (a digraph such as 'gw' or 'ng'), otherwise the first
    character.  Similarity is directional: it holds when the initials are
    equal or when initial2 is listed among initial1's confusables.

    Bug fix: the original indexed ``jyutping[0]``, which raises IndexError
    for an empty syllable; ``[:1]`` yields '' instead, so empty input is
    handled gracefully (two empty syllables compare as similar).
    """
    prefix1, prefix2 = jyutping1[:2], jyutping2[:2]
    initial1 = prefix1 if prefix1 in similar_initials else jyutping1[:1]
    initial2 = prefix2 if prefix2 in similar_initials else jyutping2[:1]
    return initial1 == initial2 or initial2 in similar_initials.get(initial1, [])
|
59 |
+
|
60 |
+
def match_user_input(user_input):
    """Match *user_input* (Cantonese text) against the saved jyutping results.

    Pipeline: convert the text to jyutping syllables, then
      1. return the first saved entry whose "jyutping" list covers every
         input syllable (exact match), else
      2. return the saved entry with the highest count of phonetically
         similar syllable pairs (closest match), else
      3. return a "no match" message.

    Always returns a pretty-printed JSON string (non-ASCII preserved).
    """
    similar_initials = get_similar_initials()
    saved_results = load_saved_results('jyutping_results_largec.json')
    char_map = preprocess_jyutping_data(load_lexi_can_key('lexi-can_key.json'))

    converted = chinese_batch_to_jyutping([user_input], char_map)[0]
    user_jyutping = converted["jyutping"]
    input_text = converted["chinese"]

    def to_json(payload):
        # All three outcomes share the same serialization settings.
        return json.dumps(payload, ensure_ascii=False, indent=4)

    # 1. Exact match: every input syllable appears in the saved entry.
    user_set = set(user_jyutping)
    for candidate in saved_results:
        if user_set.issubset(candidate["jyutping"]):
            return to_json({
                "input_text": input_text,
                "input_jyutping": user_jyutping,
                "match": candidate,
            })

    # 2. Closest match: count similar (saved, input) syllable pairs.
    best = None
    best_score = 0
    for candidate in saved_results:
        score = sum(
            1
            for saved_syllable in candidate["jyutping"]
            for user_syllable in user_jyutping
            if are_jyutping_similar(user_syllable, saved_syllable, similar_initials)
        )
        if score > best_score:
            best_score = score
            best = candidate

    if best:
        return to_json({
            "input_text": input_text,
            "input_jyutping": user_jyutping,
            "closest_match": best,
        })

    # 3. Nothing scored above zero.
    return to_json({
        "message": "No suitable match found. Please check the input or try different text."
    })
|
100 |
+
|
101 |
+
# Sample inputs for the Gradio dropdown — presumably Hong Kong street/building
# names containing homophone substitutions to exercise the phonetic matcher
# (TODO confirm the intended canonical forms against the saved results file).
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里"
]
|
112 |
+
|
113 |
+
def gradio_app(sample_case, custom_input):
    """Gradio entry point: match whichever input is non-empty, first one wins.

    NOTE(review): gr.Interface passes widget values to this function in the
    order they appear in `inputs`; check below which widget actually feeds
    each parameter — the names here may not match the widget labels.
    """
    chosen = sample_case or custom_input
    return match_user_input(chosen)
|
116 |
+
|
117 |
+
# Gradio UI wiring.  NOTE(review): the Textbox is listed FIRST and the
# Dropdown SECOND, so the typed text is delivered to gradio_app's
# `sample_case` parameter and the dropdown choice to `custom_input` —
# the parameter names appear swapped relative to the widget labels, and
# typed text therefore takes priority over a dropdown selection; confirm
# this priority is intended.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Or enter text", label="Placename/Street/Building name"),
        # [None] is prepended so the dropdown can be left unselected.
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Select a sample case or enter Cantonese text, and the app will return a match or the closest match based on phonetic similarity. 選擇一個範例案例或輸入粵語文本,應用程式將傳回粵拼匹配或基於語音相似性的最接近匹配。"
)

# Start the Gradio server (blocking call).
interface.launch()
|
jyutping_results_largec.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
lexi-can_key.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|