Spaces:

OttoYu
/

Cantonese-Phonetics

Runtime error

App Files Files Community

Cantonese-Phonetics / CantonesePhonetics.js

OttoYu's picture

Upload CantonesePhonetics.js

ab38ebf verified 6 months ago

history blame contribute delete

6 kB

	/**
	 * Maps Chinese text to Jyutping syllables and ranks saved phrases by how
	 * closely they sound like (and look like) a piece of user input.
	 *
	 * Usage: construct, `await initialize()` to load the lookup data, then call
	 * `matchUserInput(text)`.
	 */
	class CantonesePhonetics {
		constructor() {
			// character -> Jyutping syllable, filled by initialize().
			this.charToJyutping = {};
			// Candidate phrases; each entry is read as { text, jyutping } where
			// jyutping is an array of syllables. Filled by initialize().
			this.savedResults = [];
			// Initials treated as interchangeable when comparing two syllables.
			// Lookups are done in both directions, so the table need not be symmetric.
			this.similarInitials = {
				b: ["p", "m"],
				c: ["z", "s"],
				d: ["t", "n"],
				f: ["h", "w"],
				g: ["k", "ng"],
				gw: ["kw"],
				h: ["f", "w"],
				j: ["z", "c"],
				k: ["g", "h"],
				kw: ["gw"],
				l: ["n"],
				m: ["n", "b"],
				n: ["l", "m", "ng"],
				ng: ["g", "n"],
				p: ["b", "m"],
				s: ["c", "z"],
				t: ["d", "n"],
				w: ["f", "h"],
				z: ["c", "j"]
			};
			// Finals (without tone digit) treated as interchangeable.
			this.similarFinals = {
				aa: ["a", "aai", "aau"],
				aai: ["aa", "ai"],
				aau: ["aa", "au"],
				ai: ["ei", "aai"],
				au: ["ou", "aau"],
				e: ["i", "ei"],
				ei: ["ai", "i"],
				i: ["e", "ei", "yu"],
				o: ["u", "ou"],
				oi: ["ui"],
				ou: ["u", "au"],
				u: ["o", "ou", "yu"],
				ui: ["oi"],
				yu: ["i", "u"]
			};
		}

		/**
		 * Downloads the character table and the saved phrase list in parallel and
		 * stores them on the instance.
		 *
		 * @returns {Promise<void>}
		 * @throws {Error} If either request fails, answers with a non-2xx status,
		 *   or returns a body that is not valid JSON.
		 */
		async initialize() {
			// fetch() only rejects on network failure; an HTTP error page would
			// otherwise reach response.json() and fail with an opaque parse error.
			const fetchJson = async url => {
				const response = await fetch(url);
				if (!response.ok) {
					throw new Error(`Failed to load ${url}: HTTP ${response.status}`);
				}
				return response.json();
			};

			const [jyutpingData, results] = await Promise.all([
				fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/lexi-can_key.json"),
				fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/jyutping_results_largec.json")
			]);

			this.charToJyutping = this.preprocessJyutpingData(jyutpingData);
			this.savedResults = results;
		}

		/**
		 * Inverts a syllable -> mappings table into a character -> syllable table.
		 *
		 * @param {Object<string, Array<Object>>} jyutpingData Keys are syllables;
		 *   each value is a list of objects whose "漢字" property is iterated
		 *   character by character.
		 * @returns {Object<string, string>} One syllable per character. When a
		 *   character appears under several syllables, the last one visited wins.
		 */
		preprocessJyutpingData(jyutpingData) {
			const result = {};
			for (const [syllable, mappings] of Object.entries(jyutpingData)) {
				for (const mapping of mappings) {
					// for...of walks the string by code point, so characters outside
					// the BMP become single keys.
					for (const char of mapping["漢字"]) {
						result[char] = syllable;
					}
				}
			}
			return result;
		}

		/**
		 * Converts text to a list with one entry per character: the character's
		 * Jyutping syllable, or the character itself when it is not in the table.
		 *
		 * @param {string} text
		 * @returns {string[]}
		 */
		chineseToJyutping(text) {
			// Array.from iterates by code point, matching how the table keys were
			// built; split("") would cut supplementary-plane characters in half.
			return Array.from(text).map(char => this.charToJyutping[char] || char);
		}

		/**
		 * Splits a syllable into initial, final and tone digit.
		 *
		 * @param {string} jyutping e.g. "gwong2", "ai", "ngo5".
		 * @returns {string[]} `[initial, final, tone]`; any part may be "".
		 */
		splitJyutping(jyutping) {
			if (typeof jyutping !== "string" || jyutping.length === 0) {
				return ["", "", ""];
			}

			// Jyutping writes the tone as one trailing digit 1-6. Keeping it apart
			// lets the toneless finals table apply to toned syllables.
			const toneMatch = /[1-6]$/.exec(jyutping);
			const tone = toneMatch ? toneMatch[0] : "";
			const body = tone ? jyutping.slice(0, -1) : jyutping;

			// A syllable that starts with a vowel has no initial at all.
			if (body.length === 0 || "aeiou".includes(body[0])) {
				return ["", body, tone];
			}

			// The only two-letter initials in the similarity table.
			const digraph = body.slice(0, 2);
			if (digraph === "ng" || digraph === "gw" || digraph === "kw") {
				return [digraph, body.slice(2), tone];
			}
			return [body[0], body.slice(1), tone];
		}

		/**
		 * Decides whether two syllables sound alike: the tone digits (if any) must
		 * be equal, and both the initials and the finals must be equal or listed as
		 * similar in the instance tables.
		 *
		 * @param {string} jyutping1
		 * @param {string} jyutping2
		 * @returns {boolean}
		 */
		areJyutpingSimilar(jyutping1, jyutping2) {
			if (jyutping1 === jyutping2) {
				return true;
			}

			const [initial1, final1, tone1] = this.splitJyutping(jyutping1);
			const [initial2, final2, tone2] = this.splitJyutping(jyutping2);

			if (tone1 !== tone2) {
				return false;
			}

			// Equal, or listed under either key of the table.
			const related = (table, a, b) =>
				a === b ||
				(Array.isArray(table[a]) && table[a].includes(b)) ||
				(Array.isArray(table[b]) && table[b].includes(a));

			return related(this.similarInitials, initial1, initial2) &&
				related(this.similarFinals, final1, final2);
		}

		/**
		 * Counts every (user syllable, result syllable) pair that is similar and
		 * divides by the longer list's length.
		 *
		 * Because all pairs are counted, repeated syllables can push the value
		 * above 1.
		 *
		 * @param {string[]} userJyutping
		 * @param {string[]} resultJyutping
		 * @returns {number} 0 when both lists are empty.
		 */
		calculatePhoneticSimilarity(userJyutping, resultJyutping) {
			const longest = Math.max(userJyutping.length, resultJyutping.length);
			if (longest === 0) {
				// Avoid 0 / 0 = NaN, which would break score sorting.
				return 0;
			}
			const similarCount = userJyutping.reduce(
				(count, uj) => count + resultJyutping.filter(rj => this.areJyutpingSimilar(uj, rj)).length,
				0
			);
			return similarCount / longest;
		}

		/**
		 * Normalised text similarity: 1 minus the edit distance divided by the
		 * longer string's length.
		 *
		 * @param {string} s1
		 * @param {string} s2
		 * @returns {number} Value in [0, 1]; 1.0 when both strings are empty.
		 */
		similarity(s1, s2) {
			let longer = s1;
			let shorter = s2;
			if (s1.length < s2.length) {
				longer = s2;
				shorter = s1;
			}
			const longerLength = longer.length;
			if (longerLength === 0) {
				return 1.0;
			}
			return (longerLength - this.editDistance(longer, shorter)) / longerLength;
		}

		/**
		 * Case-insensitive Levenshtein distance computed with a single rolling row.
		 *
		 * @param {string} s1
		 * @param {string} s2
		 * @returns {number} Number of single-unit insertions, deletions and
		 *   substitutions needed to turn one string into the other.
		 */
		editDistance(s1, s2) {
			s1 = s1.toLowerCase();
			s2 = s2.toLowerCase();

			// costs[j] holds the distance between the processed prefix of s1 and
			// the first j units of s2.
			const costs = [];
			for (let i = 0; i <= s1.length; i++) {
				// lastValue carries the cell to the left in the current row.
				let lastValue = i;
				for (let j = 0; j <= s2.length; j++) {
					if (i === 0) {
						costs[j] = j;
					} else if (j > 0) {
						let newValue = costs[j - 1];
						if (s1.charAt(i - 1) !== s2.charAt(j - 1)) {
							newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
						}
						costs[j - 1] = lastValue;
						lastValue = newValue;
					}
				}
				if (i > 0) {
					costs[s2.length] = lastValue;
				}
			}
			return costs[s2.length];
		}

		/**
		 * Finds the saved phrase(s) that best match the user's text.
		 *
		 * First looks for a saved entry whose syllable list contains every input
		 * syllable (reported as "exact"). Otherwise scores every usable entry as
		 * 0.7 * phonetic similarity + 0.2 * text similarity + 0.1 * length penalty
		 * and returns the three highest.
		 *
		 * @param {string} userInput
		 * @returns {Object} Either
		 *   `{ input_text, input_jyutping, match, match_type: "exact" }` or
		 *   `{ input_text, input_jyutping, matches: [{ match, score, match_type }] }`.
		 *   Empty input yields an empty `matches` list.
		 */
		matchUserInput(userInput) {
			const userJyutping = this.chineseToJyutping(userInput);

			// every() on an empty list is vacuously true, which would report the
			// first saved entry as an exact match for empty input.
			if (userJyutping.length === 0) {
				return {
					input_text: userInput,
					input_jyutping: userJyutping,
					matches: []
				};
			}

			const exactMatch = this.savedResults.find(result =>
				// Entries without a syllable list cannot match; skip instead of throwing.
				Boolean(result && result.jyutping) &&
				userJyutping.every(uj => result.jyutping.includes(uj))
			);

			if (exactMatch) {
				return {
					input_text: userInput,
					input_jyutping: userJyutping,
					match: exactMatch,
					match_type: "exact"
				};
			}

			const matches = this.savedResults
				.map(result => {
					if (!result || !result.text || !result.jyutping) return null;

					const phoneticScore = this.calculatePhoneticSimilarity(userJyutping, result.jyutping);
					const textSimilarity = this.similarity(userInput, result.text);
					const lengthDiff = Math.abs(userInput.length - result.text.length);
					// 1 for equal lengths, shrinking towards 0 as lengths diverge.
					const lengthPenalty = 1 / (1 + lengthDiff);

					const totalScore = phoneticScore * 0.7 + textSimilarity * 0.2 + lengthPenalty * 0.1;
					return {
						result,
						score: totalScore
					};
				})
				.filter(Boolean);

			matches.sort((a, b) => b.score - a.score);
			const topMatches = matches.slice(0, 3);

			return {
				input_text: userInput,
				input_jyutping: userJyutping,
				matches: topMatches.map(match => ({
					match: match.result,
					score: match.score,
					match_type: "phonetic_similarity"
				}))
			};
		}
	}

	// Module-level matcher instance. The constructor leaves charToJyutping and
	// savedResults empty; they are only filled by initialize(), which is not
	// called here.
	const phonetics = new CantonesePhonetics();