Spaces:

hoang1007
/

spelling-correction

Runtime error

File size: 1,474 Bytes

44db343

import json
from tqdm import tqdm
import sys
from viet_text_tools import normalize_diacritics
sys.path.append("..")
from utils.logger import get_logger
import re
# All paths are relative, so they resolve against the current working directory.
vsec_path = "../data/vsec/VSEC.jsonl"

# Load the annotated corpus first, so a missing or malformed input fails
# before the output files below are created/truncated.
with open(vsec_path, "r", encoding="utf-8") as file:
    # One JSON document per line. json.loads tolerates surrounding whitespace,
    # so the line is passed as-is instead of slicing off its last character
    # (which would cut the closing brace of a final line that lacks a newline).
    # Blank lines are skipped.
    data = [json.loads(line) for line in file if line.strip()]

# Output handles stay open for the write loop at the bottom of the script.
# The encoding is explicit so the Vietnamese text is written as UTF-8
# regardless of the platform default.
test_file = open("../data/vsec/vsec.test", "w+", encoding="utf-8")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+", encoding="utf-8")

def get_true_text(sentence: dict) -> str:
    """Build the corrected, tokenized text for one annotated sentence.

    For every entry of ``sentence['annotations']`` the ``current_syllable`` is
    kept when ``is_correct`` is truthy; otherwise the first element of
    ``alternative_syllables`` is used instead. The chosen syllables are joined
    with spaces and then re-tokenized so that each run of word characters and
    each single non-word, non-space character becomes its own token.

    Args:
        sentence: Mapping with an ``annotations`` list whose entries provide
            ``is_correct``, ``current_syllable`` and ``alternative_syllables``.

    Returns:
        The tokens joined by single spaces (an empty string when there are
        no annotations).

    Raises:
        KeyError: If a required key is missing.
        IndexError: If an entry that is not correct has an empty
            ``alternative_syllables`` sequence.
    """
    true_tokens = [
        word['current_syllable'] if word['is_correct']
        else word['alternative_syllables'][0]
        for word in sentence['annotations']
    ]
    # Raw string: "\w" and "\s" inside a plain literal are invalid string
    # escapes and trigger a SyntaxWarning on recent Python versions.
    words = re.findall(r"\w+|[^\w\s]", " ".join(true_tokens))
    return " ".join(words)

def get_noise_text(sentence: dict) -> str:
    """Build the uncorrected (noisy), tokenized text for one annotated sentence.

    The ``current_syllable`` of every entry in ``sentence['annotations']`` is
    joined with spaces, re-tokenized so that each run of word characters and
    each single non-word, non-space character becomes its own token, and every
    token is passed through ``normalize_diacritics``.

    Args:
        sentence: Mapping with an ``annotations`` list whose entries provide
            ``current_syllable``.

    Returns:
        The normalized tokens joined by single spaces.

    Raises:
        KeyError: If a required key is missing.
    """
    noised_sentence = " ".join(
        word['current_syllable'] for word in sentence['annotations']
    )
    # Raw string: "\w" and "\s" inside a plain literal are invalid string
    # escapes and trigger a SyntaxWarning on recent Python versions.
    words = re.findall(r"\w+|[^\w\s]", noised_sentence)
    # NOTE(review): normalize_diacritics comes from viet_text_tools; presumably
    # it canonicalizes Vietnamese tone-mark placement per token - confirm.
    return " ".join(normalize_diacritics(word) for word in words)

# Emit one line per sentence to each output file, keeping the two files
# line-aligned. Using the already-open handles as context managers guarantees
# both are flushed and closed even if a sentence fails to convert.
with test_file, test_noise_file:
    for sentence in tqdm(data):
        # Compute both variants before writing either, so a failure cannot
        # leave one file a line ahead of the other.
        true_text = get_true_text(sentence)
        noised_text = get_noise_text(sentence)

        test_file.write(true_text + "\n")
        test_noise_file.write(noised_text + "\n")