Spaces:
Runtime error
Runtime error
File size: 1,474 Bytes
44db343 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import json
from tqdm import tqdm
import sys
from viet_text_tools import normalize_diacritics
sys.path.append("..")
from utils.logger import get_logger
import re
# Input corpus: one JSON object per line.
vsec_path = "../data/vsec/VSEC.jsonl"

# The output handles stay open at module level because the export loop at the
# bottom of the script writes to them and closes them.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding (the corpus holds non-ASCII text).
test_file = open("../data/vsec/vsec.test", "w+", encoding="utf-8")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+", encoding="utf-8")

with open(vsec_path, "r", encoding="utf-8") as file:
    # Iterate the file lazily and let json.loads ignore surrounding
    # whitespace instead of slicing off the last character: a final line
    # without a trailing newline would otherwise lose its closing brace.
    # Blank lines are skipped rather than raising a JSON decode error.
    data = [json.loads(line) for line in file if line.strip()]
def get_true_text(sentence: dict):
    """Build the corrected, tokenized text of an annotated sentence.

    Args:
        sentence: Record with an ``annotations`` list. Each annotation
            provides ``current_syllable`` and ``is_correct``; annotations
            that are not correct are expected to provide a non-empty
            ``alternative_syllables`` list.

    Returns:
        The sentence in which every syllable flagged as incorrect is replaced
        by its first alternative, re-tokenized so that runs of word
        characters and individual punctuation characters are separated by
        single spaces. Returns an empty string when there are no annotations.
    """
    true_tokens = []
    for word in sentence["annotations"]:
        alternatives = word.get("alternative_syllables")
        if word["is_correct"] or not alternatives:
            # Also reached when a syllable is flagged incorrect but carries
            # no alternatives: keep the original text instead of raising
            # IndexError on an empty list.
            true_tokens.append(word["current_syllable"])
        else:
            true_tokens.append(alternatives[0])
    true_sentence = " ".join(true_tokens)
    # Raw string: "\w" and "\s" are not valid escapes in a plain literal.
    words = re.findall(r"\w+|[^\w\s]{1}", true_sentence)
    return " ".join(words)
def get_noise_text(sentence: dict):
    """Build the uncorrected (noisy), tokenized text of an annotated sentence.

    Args:
        sentence: Record with an ``annotations`` list whose items each
            provide ``current_syllable``.

    Returns:
        The syllables exactly as annotated (no corrections applied),
        re-tokenized into runs of word characters and individual punctuation
        characters, each token passed through ``normalize_diacritics`` and
        joined by single spaces.
    """
    noised_sentence = " ".join(
        word["current_syllable"] for word in sentence["annotations"]
    )
    # Raw string: "\w" and "\s" are not valid escapes in a plain literal.
    words = re.findall(r"\w+|[^\w\s]{1}", noised_sentence)
    # NOTE(review): normalize_diacritics comes from viet_text_tools; presumably
    # it canonicalizes diacritic placement per token -- confirm in the library.
    return " ".join(normalize_diacritics(word) for word in words)
# Write one line per sentence to each output file: the corrected text to
# test_file and the uncorrected text to test_noise_file, so that line i of
# both files refers to the same sentence.
# The already-open file objects are used as context managers so both are
# flushed and closed even if processing a sentence raises.
with test_file, test_noise_file:
    for sentence in tqdm(data):
        true_text = get_true_text(sentence)
        noised_text = get_noise_text(sentence)
        test_file.write(true_text + "\n")
        test_noise_file.write(noised_text + "\n")