// Cantonese-Phonetics / CantonesePhonetics.js
// Uploaded by OttoYu.
// Commit: "Upload CantonesePhonetics.js" (ab38ebf, verified)
/**
 * Phonetic matcher for Cantonese text based on Jyutping romanisation.
 *
 * Typical use: construct, `await initialize()` to download the lookup data,
 * then call `matchUserInput(text)`. The instance fields `charToJyutping` and
 * `savedResults` may also be assigned directly (e.g. in tests).
 */
class CantonesePhonetics {
  /**
   * Creates an empty matcher and defines the tables of initials/finals that
   * are treated as "sounding alike". The tables are looked up in both
   * directions, so a pair only needs to be listed under one of its members.
   */
  constructor() {
    /** Maps one Han character (one code point) to a Jyutping syllable. */
    this.charToJyutping = {};
    /** Candidate entries; `matchUserInput` reads their `text` and `jyutping`. */
    this.savedResults = [];
    this.similarInitials = {
      b: ["p", "m"],
      c: ["z", "s"],
      d: ["t", "n"],
      f: ["h", "w"],
      g: ["k", "ng"],
      gw: ["kw"],
      h: ["f", "w"],
      j: ["z", "c"],
      k: ["g", "h"],
      kw: ["gw"],
      l: ["n"],
      m: ["n", "b"],
      n: ["l", "m", "ng"],
      ng: ["g", "n"],
      p: ["b", "m"],
      s: ["c", "z"],
      t: ["d", "n"],
      w: ["f", "h"],
      z: ["c", "j"],
    };
    this.similarFinals = {
      aa: ["a", "aai", "aau"],
      aai: ["aa", "ai"],
      aau: ["aa", "au"],
      ai: ["ei", "aai"],
      au: ["ou", "aau"],
      e: ["i", "ei"],
      ei: ["ai", "i"],
      i: ["e", "ei", "yu"],
      o: ["u", "ou"],
      oi: ["ui"],
      ou: ["u", "au"],
      u: ["o", "ou", "yu"],
      ui: ["oi"],
      yu: ["i", "u"],
    };
    // Optional initial (two-letter initials are tried first), then the rime,
    // then an optional trailing tone digit. `[\s\S]` keeps the pattern
    // matching any input string, including ones that contain newlines.
    this.jyutpingPattern = /^(ng|gw|kw|[bpmfdtnlgkhwzcsj])?([\s\S]*?)([1-6])?$/;
  }

  /**
   * Downloads both JSON data files in parallel and populates
   * `charToJyutping` and `savedResults`.
   *
   * @returns {Promise<void>}
   * @throws {Error} When a request fails, answers with a non-2xx status, or
   *   the body is not valid JSON.
   */
  async initialize() {
    const fetchJson = async (url) => {
      const response = await fetch(url);
      // Without this check an HTTP error page would either be parsed as data
      // or surface as an unrelated JSON syntax error.
      if (!response.ok) {
        throw new Error(`Failed to load ${url}: HTTP ${response.status}`);
      }
      return response.json();
    };
    const [jyutpingData, results] = await Promise.all([
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/lexi-can_key.json"),
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/jyutping_results_largec.json"),
    ]);
    this.charToJyutping = this.preprocessJyutpingData(jyutpingData);
    this.savedResults = results;
  }

  /**
   * Inverts a syllable -> mappings table into a character -> syllable table.
   *
   * Every character found in a mapping's "漢字" field is assigned the syllable
   * it is listed under. When a character is listed under several syllables,
   * the one visited last wins.
   *
   * @param {Object<string, Array<Object>>} jyutpingData Syllable-keyed table.
   * @returns {Object<string, string>} Character -> syllable lookup.
   */
  preprocessJyutpingData(jyutpingData) {
    const result = {};
    for (const [syllable, mappings] of Object.entries(jyutpingData)) {
      for (const mapping of mappings) {
        // Skip malformed entries instead of failing on a non-iterable value.
        if (!mapping || !mapping["漢字"]) {
          continue;
        }
        for (const char of mapping["漢字"]) {
          result[char] = syllable;
        }
      }
    }
    return result;
  }

  /**
   * Converts text into one item per character: the character's Jyutping
   * syllable when known, otherwise the character itself.
   *
   * The text is iterated by code point so that Han characters outside the
   * Basic Multilingual Plane stay intact and hit the same keys that
   * `preprocessJyutpingData` produced.
   *
   * @param {string} text
   * @returns {string[]}
   */
  chineseToJyutping(text) {
    return Array.from(text, (char) => this.charToJyutping[char] || char);
  }

  /**
   * Splits a syllable into `[initial, rime, tone]`.
   *
   * The initial is one of the keys of `similarInitials` or `""` when the
   * syllable starts with anything else (e.g. a vowel). The tone is a trailing
   * digit 1-6 or `""`. Strings that are not Jyutping simply end up in `rime`.
   *
   * @param {string} jyutping
   * @returns {string[]} Three strings: initial, rime, tone.
   */
  splitJyutping(jyutping) {
    const parts = this.jyutpingPattern.exec(String(jyutping));
    if (!parts) {
      return ["", String(jyutping), ""];
    }
    return [parts[1] || "", parts[2] || "", parts[3] || ""];
  }

  /**
   * Tells whether two sounds are identical or listed as similar in `table`
   * (checked in both directions).
   *
   * @param {Object<string, string[]>} table `similarInitials` or `similarFinals`.
   * @param {string} sound1
   * @param {string} sound2
   * @returns {boolean}
   */
  areSoundsSimilar(table, sound1, sound2) {
    if (sound1 === sound2) {
      return true;
    }
    const lists = (from, to) =>
      Object.prototype.hasOwnProperty.call(table, from) && table[from].includes(to);
    return lists(sound1, sound2) || lists(sound2, sound1);
  }

  /**
   * Tells whether two syllables sound alike: same tone, and both the initials
   * and the rimes are identical or listed as similar.
   *
   * The tone is separated from the rime before consulting `similarFinals`,
   * because that table is keyed by toneless finals.
   *
   * @param {string} jyutping1
   * @param {string} jyutping2
   * @returns {boolean}
   */
  areJyutpingSimilar(jyutping1, jyutping2) {
    const [initial1, rime1, tone1] = this.splitJyutping(jyutping1);
    const [initial2, rime2, tone2] = this.splitJyutping(jyutping2);
    if (tone1 !== tone2) {
      return false;
    }
    return (
      this.areSoundsSimilar(this.similarInitials, initial1, initial2) &&
      this.areSoundsSimilar(this.similarFinals, rime1, rime2)
    );
  }

  /**
   * Scores how much of two syllable lists sound alike, in the range [0, 1].
   *
   * Each user syllable is paired with the first not-yet-used similar result
   * syllable (greedy, one-to-one), and the number of pairs is divided by the
   * longer list's length. Pairing one-to-one keeps repeated syllables from
   * being counted several times.
   *
   * @param {string[]} userJyutping
   * @param {string[]} resultJyutping
   * @returns {number} 0 when both lists are empty.
   */
  calculatePhoneticSimilarity(userJyutping, resultJyutping) {
    const longest = Math.max(userJyutping.length, resultJyutping.length);
    if (longest === 0) {
      return 0;
    }
    const usedIndexes = new Set();
    let pairedCount = 0;
    for (const userSyllable of userJyutping) {
      const index = resultJyutping.findIndex(
        (resultSyllable, i) =>
          !usedIndexes.has(i) && this.areJyutpingSimilar(userSyllable, resultSyllable)
      );
      if (index !== -1) {
        usedIndexes.add(index);
        pairedCount += 1;
      }
    }
    return pairedCount / longest;
  }

  /**
   * Case-insensitive text similarity: `1 - editDistance / longerLength`,
   * with lengths counted in code points. Two empty strings score 1.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  similarity(s1, s2) {
    // Measure after lower-casing so the length uses the same units as the
    // distance computed by editDistance.
    const longerLength = Math.max(
      Array.from(s1.toLowerCase()).length,
      Array.from(s2.toLowerCase()).length
    );
    if (longerLength === 0) {
      return 1.0;
    }
    return (longerLength - this.editDistance(s1, s2)) / longerLength;
  }

  /**
   * Case-insensitive Levenshtein distance between two strings, counted in
   * code points (insertions, deletions and substitutions each cost 1).
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  editDistance(s1, s2) {
    const source = Array.from(s1.toLowerCase());
    const target = Array.from(s2.toLowerCase());
    // previousRow[j] = distance between the processed prefix of `source`
    // and the first j characters of `target`.
    let previousRow = Array.from({ length: target.length + 1 }, (_, j) => j);
    for (let i = 1; i <= source.length; i++) {
      const currentRow = [i];
      for (let j = 1; j <= target.length; j++) {
        const substitution = previousRow[j - 1] + (source[i - 1] === target[j - 1] ? 0 : 1);
        currentRow[j] = Math.min(substitution, previousRow[j] + 1, currentRow[j - 1] + 1);
      }
      previousRow = currentRow;
    }
    return previousRow[target.length];
  }

  /**
   * Finds the saved entries that best match the user's text.
   *
   * 1. Empty input yields `{ input_text, input_jyutping, matches: [] }`.
   * 2. If some entry's `jyutping` contains every input syllable, the first
   *    such entry is returned as
   *    `{ input_text, input_jyutping, match, match_type: "exact" }`.
   *    Note this is a containment test, not a sequence comparison.
   * 3. Otherwise every entry having both `text` and `jyutping` is scored as
   *    70% phonetic similarity + 20% text similarity + 10% length closeness,
   *    and the top three are returned in `matches`.
   *
   * @param {string} userInput
   * @returns {Object}
   */
  matchUserInput(userInput) {
    const userJyutping = this.chineseToJyutping(userInput);
    // `every` is vacuously true for an empty list, which would otherwise
    // report the first saved entry as an exact match for empty input.
    if (userJyutping.length === 0) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        matches: [],
      };
    }

    const exactMatch = this.savedResults.find(
      (result) =>
        Boolean(result && result.jyutping) &&
        userJyutping.every((syllable) => result.jyutping.includes(syllable))
    );
    if (exactMatch) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        match: exactMatch,
        match_type: "exact",
      };
    }

    // One item per input character, so this is the input's character count.
    const inputLength = userJyutping.length;
    const matches = this.savedResults
      .map((result) => {
        if (!result || !result.text || !result.jyutping) {
          return null;
        }
        const phoneticScore = this.calculatePhoneticSimilarity(userJyutping, result.jyutping);
        const textSimilarity = this.similarity(userInput, result.text);
        const lengthDiff = Math.abs(inputLength - Array.from(result.text).length);
        const lengthPenalty = 1 / (1 + lengthDiff);
        const totalScore = phoneticScore * 0.7 + textSimilarity * 0.2 + lengthPenalty * 0.1;
        return {
          result,
          score: totalScore,
        };
      })
      .filter(Boolean);
    matches.sort((a, b) => b.score - a.score);

    return {
      input_text: userInput,
      input_jyutping: userJyutping,
      matches: matches.slice(0, 3).map((match) => ({
        match: match.result,
        score: match.score,
        match_type: "phonetic_similarity",
      })),
    };
  }
}
// Module-level matcher instance. Only the constructor runs here, so
// `charToJyutping` and `savedResults` start out empty; no call to
// `initialize()` appears in the visible code.
const phonetics = new CantonesePhonetics();