Spaces:
Runtime error
Runtime error
import json | |
from tqdm import tqdm | |
import sys | |
from viet_text_tools import normalize_diacritics | |
sys.path.append("..") | |
from utils.logger import get_logger | |
import re | |
# Input corpus: one JSON object per line (JSON Lines).
vsec_path = "../data/vsec/VSEC.jsonl"

# Output handles are opened here and closed at the end of the script.
# The encoding is pinned to UTF-8 so the Vietnamese text is written the same
# way on every platform instead of depending on the locale's default codec.
test_file = open("../data/vsec/vsec.test", "w+", encoding="utf-8")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+", encoding="utf-8")

with open(vsec_path, "r", encoding="utf-8") as file:
    # json.loads tolerates surrounding whitespace, so the line terminator does
    # not need to be sliced off (slicing would truncate a final line that has
    # no trailing newline). Blank lines are skipped rather than parsed.
    data = [json.loads(line) for line in file if line.strip()]
def get_true_text(sentence: dict) -> str:
    """Build the corrected, tokenized text for one annotated sentence.

    For each entry of ``sentence['annotations']`` the written syllable
    (``current_syllable``) is kept when the entry is flagged ``is_correct``;
    otherwise the first item of ``alternative_syllables`` replaces it. If an
    entry is flagged incorrect but lists no alternatives, the written
    syllable is kept instead of raising.

    Args:
        sentence: Record holding an ``annotations`` list of per-syllable
            dicts with ``current_syllable``, ``is_correct`` and (for
            incorrect entries) ``alternative_syllables``.

    Returns:
        The corrected sentence with word runs and individual punctuation
        characters separated by single spaces.
    """
    true_tokens = []
    for word in sentence['annotations']:
        alternatives = word.get('alternative_syllables')
        if word['is_correct'] or not alternatives:
            true_tokens.append(word['current_syllable'])
        else:
            true_tokens.append(alternatives[0])
    true_sentence = " ".join(true_tokens)
    # Split into runs of word characters and single non-word, non-space
    # characters so punctuation becomes its own token.
    words = re.findall(r"\w+|[^\w\s]", true_sentence)
    return " ".join(words)
def get_noise_text(sentence: dict) -> str:
    """Build the uncorrected (noisy), tokenized text for one sentence.

    Joins the ``current_syllable`` of every entry in
    ``sentence['annotations']`` exactly as written, splits the result into
    word runs and single punctuation characters, and passes each token
    through ``normalize_diacritics``.

    Args:
        sentence: Record holding an ``annotations`` list of per-syllable
            dicts, each with a ``current_syllable`` string.

    Returns:
        The normalized tokens joined by single spaces.
    """
    noised_sentence = " ".join(
        word['current_syllable'] for word in sentence['annotations']
    )
    # Same tokenization as the corrected text: word runs and single
    # non-word, non-space characters.
    words = re.findall(r"\w+|[^\w\s]", noised_sentence)
    return " ".join([normalize_diacritics(word) for word in words])
# Emit one line per sentence to each output file: the corrected text to
# test_file and the noisy text to test_noise_file, kept line-aligned.
try:
    for sentence in tqdm(data):
        true_text = get_true_text(sentence)
        noised_text = get_noise_text(sentence)
        test_file.write(true_text + "\n")
        test_noise_file.write(noised_text + "\n")
finally:
    # Close both outputs even if a record raises mid-run, so lines already
    # written are flushed and the file handles are not leaked.
    test_file.close()
    test_noise_file.close()