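// Page banner captured together with this script (apparently the hosting
// page's status header, not part of the program): "Spaces: Runtime error".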
/**
 * Converts Chinese text to Jyutping syllables and ranks saved phrases by how
 * closely they resemble a user's input.
 *
 * `initialize()` downloads the lookup data; until it has run (or until
 * `charToJyutping` / `savedResults` are assigned directly) both are empty.
 */
class CantonesePhonetics {
  constructor() {
    // Single character -> Jyutping syllable, filled by initialize().
    this.charToJyutping = {};
    // Saved phrase records; the matching code reads their `text` and
    // `jyutping` properties.
    this.savedResults = [];
    // Initials treated as interchangeable. Lookups are done in both
    // directions, so a pair only needs to be listed on one side.
    this.similarInitials = {
      b: ["p", "m"],
      c: ["z", "s"],
      d: ["t", "n"],
      f: ["h", "w"],
      g: ["k", "ng"],
      gw: ["kw"],
      h: ["f", "w"],
      j: ["z", "c"],
      k: ["g", "h"],
      kw: ["gw"],
      l: ["n"],
      m: ["n", "b"],
      n: ["l", "m", "ng"],
      ng: ["g", "n"],
      p: ["b", "m"],
      s: ["c", "z"],
      t: ["d", "n"],
      w: ["f", "h"],
      z: ["c", "j"]
    };
    // Finals treated as interchangeable (same two-way lookup as above).
    this.similarFinals = {
      aa: ["a", "aai", "aau"],
      aai: ["aa", "ai"],
      aau: ["aa", "au"],
      ai: ["ei", "aai"],
      au: ["ou", "aau"],
      e: ["i", "ei"],
      ei: ["ai", "i"],
      i: ["e", "ei", "yu"],
      o: ["u", "ou"],
      oi: ["ui"],
      ou: ["u", "au"],
      u: ["o", "ou", "yu"],
      ui: ["oi"],
      yu: ["i", "u"]
    };
  }

  /**
   * Downloads the character table and the saved phrase list in parallel and
   * stores them on the instance.
   *
   * @returns {Promise<void>}
   * @throws {Error} When either request answers with a non-2xx status, or
   *   when fetching / JSON parsing fails.
   */
  async initialize() {
    const fetchJson = async url => {
      const response = await fetch(url);
      // Without this check an HTTP error page would be handed to the JSON
      // parser and fail with a misleading syntax error.
      if (!response.ok) {
        throw new Error(`Failed to load ${url}: HTTP ${response.status}`);
      }
      return response.json();
    };
    const [jyutpingData, results] = await Promise.all([
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/lexi-can_key.json"),
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/jyutping_results_largec.json")
    ]);
    this.charToJyutping = this.preprocessJyutpingData(jyutpingData);
    this.savedResults = results;
  }

  /**
   * Inverts a `{ syllable: [{ "漢字": characters }, ...] }` table into a
   * `{ character: syllable }` lookup.
   *
   * A character listed under several syllables keeps the syllable visited
   * last. Entries without a "漢字" value are skipped.
   *
   * @param {Object} jyutpingData - Table keyed by syllable.
   * @returns {Object} Lookup keyed by character.
   */
  preprocessJyutpingData(jyutpingData) {
    const lookup = {};
    for (const [syllable, mappings] of Object.entries(jyutpingData)) {
      for (const mapping of mappings) {
        const characters = mapping && mapping["漢字"];
        if (!characters) {
          continue;
        }
        for (const char of characters) {
          lookup[char] = syllable;
        }
      }
    }
    return lookup;
  }

  /**
   * Replaces every character that has a known reading with its Jyutping
   * syllable; characters without one are passed through unchanged.
   *
   * @param {string} text
   * @returns {string[]} One entry per character of `text`.
   */
  chineseToJyutping(text) {
    // Array.from walks code points, so characters outside the BMP stay whole
    // and match the keys built by preprocessJyutpingData (which also
    // iterates by code point).
    return Array.from(text, char => this.charToJyutping[char] || char);
  }

  /**
   * Splits a syllable into initial, final and tone.
   *
   * Assumes the usual Jyutping layout: an optional initial, a run of
   * lowercase letters, and an optional tone digit 1-6. The data files are not
   * visible from this code, so confirm the format against them.
   *
   * @param {string} syllable
   * @returns {{initial: string, final: string, tone: string}|null} `null`
   *   when the token does not have that shape (for example a Chinese
   *   character or punctuation mark that was passed through).
   */
  splitJyutping(syllable) {
    if (typeof syllable !== "string") {
      return null;
    }
    // Two-letter initials come first in the alternation so "ng", "gw" and
    // "kw" win over their one-letter prefixes. A syllable that starts with a
    // vowel simply has an empty initial.
    const parts = /^(ng|gw|kw|[bpmfdtnlgkhwzcsj])?([a-z]*)([1-6]?)$/.exec(syllable);
    if (!parts) {
      return null;
    }
    return { initial: parts[1] || "", final: parts[2], tone: parts[3] };
  }

  /**
   * Tells whether two sounds are equal or listed as similar in `table`,
   * checking the table in both directions.
   *
   * @param {Object} table - `similarInitials` or `similarFinals`.
   * @param {string} first
   * @param {string} second
   * @returns {boolean}
   */
  isListedSimilar(table, first, second) {
    if (first === second) {
      return true;
    }
    const fromFirst = table[first];
    const fromSecond = table[second];
    return (
      (Array.isArray(fromFirst) && fromFirst.includes(second)) ||
      (Array.isArray(fromSecond) && fromSecond.includes(first))
    );
  }

  /**
   * Decides whether two syllables sound alike: identical tokens always do;
   * otherwise the tones must be equal and both the initials and the finals
   * must be equal or listed as similar.
   *
   * @param {string} jyutping1
   * @param {string} jyutping2
   * @returns {boolean}
   */
  areJyutpingSimilar(jyutping1, jyutping2) {
    if (jyutping1 === jyutping2) {
      return true;
    }
    const first = this.splitJyutping(jyutping1);
    const second = this.splitJyutping(jyutping2);
    // Tokens that are not syllables can only match themselves.
    if (!first || !second) {
      return false;
    }
    // The tone is compared separately so that it no longer hides the final
    // from the similarFinals table.
    if (first.tone !== second.tone) {
      return false;
    }
    return (
      this.isListedSimilar(this.similarInitials, first.initial, second.initial) &&
      this.isListedSimilar(this.similarFinals, first.final, second.final)
    );
  }

  /**
   * Scores how much of the longer syllable list can be paired with a similar
   * syllable from the other list.
   *
   * Each syllable is paired at most once, which keeps the score inside
   * [0, 1] even when syllables repeat.
   *
   * @param {string[]} userJyutping
   * @param {string[]} resultJyutping
   * @returns {number} Paired syllables divided by the longer length; 0 when
   *   both lists are empty.
   */
  calculatePhoneticSimilarity(userJyutping, resultJyutping) {
    const longest = Math.max(userJyutping.length, resultJyutping.length);
    if (longest === 0) {
      return 0;
    }
    const taken = new Array(resultJyutping.length).fill(false);
    let paired = 0;
    for (const syllable of userJyutping) {
      const index = resultJyutping.findIndex(
        (candidate, i) => !taken[i] && this.areJyutpingSimilar(syllable, candidate)
      );
      if (index !== -1) {
        taken[index] = true;
        paired += 1;
      }
    }
    return paired / longest;
  }

  /**
   * Case-insensitive text similarity: 1 minus the edit distance relative to
   * the longer string, measured in characters (code points).
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number} 1.0 for two empty strings, otherwise a value in [0, 1].
   */
  similarity(s1, s2) {
    const longerLength = Math.max(
      Array.from(s1.toLowerCase()).length,
      Array.from(s2.toLowerCase()).length
    );
    if (longerLength === 0) {
      return 1.0;
    }
    return (longerLength - this.editDistance(s1, s2)) / longerLength;
  }

  /**
   * Case-insensitive Levenshtein distance, counted in characters (code
   * points) rather than UTF-16 units.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number} Number of insertions, deletions and substitutions.
   */
  editDistance(s1, s2) {
    const source = Array.from(s1.toLowerCase());
    const target = Array.from(s2.toLowerCase());
    // Only the previous row of the distance table is needed at any time.
    let previous = Array.from({ length: target.length + 1 }, (_, j) => j);
    for (let i = 1; i <= source.length; i++) {
      const current = [i];
      for (let j = 1; j <= target.length; j++) {
        const substitution = previous[j - 1] + (source[i - 1] === target[j - 1] ? 0 : 1);
        current[j] = Math.min(substitution, previous[j] + 1, current[j - 1] + 1);
      }
      previous = current;
    }
    return previous[target.length];
  }

  /**
   * Finds the saved phrase(s) that best match the user's input.
   *
   * The first saved record whose `jyutping` contains every input syllable is
   * returned as an "exact" match (order and extra syllables are ignored).
   * Otherwise every record with both `text` and `jyutping` is scored as
   * 0.7 * phonetic similarity + 0.2 * text similarity + 0.1 * length
   * closeness, and the three highest scores are returned.
   *
   * @param {string} userInput
   * @returns {Object} Either `{ input_text, input_jyutping, match,
   *   match_type: "exact" }` or `{ input_text, input_jyutping, matches }`,
   *   where `matches` may be empty.
   */
  matchUserInput(userInput) {
    const userJyutping = this.chineseToJyutping(userInput);
    // every() is true for an empty list, so without this guard empty input
    // would "exactly" match the first saved record.
    if (userJyutping.length === 0) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        matches: []
      };
    }

    const exactMatch = this.savedResults.find(
      result =>
        Boolean(result && result.jyutping) &&
        userJyutping.every(uj => result.jyutping.includes(uj))
    );
    if (exactMatch) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        match: exactMatch,
        match_type: "exact"
      };
    }

    const inputLength = Array.from(userInput).length;
    const scored = [];
    for (const result of this.savedResults) {
      if (!result || !result.text || !result.jyutping) {
        continue;
      }
      const phoneticScore = this.calculatePhoneticSimilarity(userJyutping, result.jyutping);
      const textSimilarity = this.similarity(userInput, result.text);
      const lengthDiff = Math.abs(inputLength - Array.from(result.text).length);
      const lengthPenalty = 1 / (1 + lengthDiff);
      scored.push({
        result,
        score: phoneticScore * 0.7 + textSimilarity * 0.2 + lengthPenalty * 0.1
      });
    }
    scored.sort((a, b) => b.score - a.score);

    return {
      input_text: userInput,
      input_jyutping: userJyutping,
      matches: scored.slice(0, 3).map(entry => ({
        match: entry.result,
        score: entry.score,
        match_type: "phonetic_similarity"
      }))
    };
  }
}
// Shared instance; its lookup tables stay empty until initialize() is called.
const phonetics = new CantonesePhonetics();