File size: 1,474 Bytes
44db343
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
from tqdm import tqdm
import sys
from viet_text_tools import normalize_diacritics
sys.path.append("..")
from utils.logger import get_logger
import re
# Input corpus: one JSON object per line (JSON Lines).
vsec_path = "../data/vsec/VSEC.jsonl"

# Output handles for the reference text and its noisy counterpart. They are
# opened here and written/closed by the export loop at the bottom of the file.
# The encoding is pinned to UTF-8 so Vietnamese characters are written the
# same way regardless of the platform's default locale encoding.
test_file = open("../data/vsec/vsec.test", "w+", encoding="utf-8")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+", encoding="utf-8")

with open(vsec_path, "r", encoding="utf-8") as file:
    # json.loads tolerates the trailing newline, so the line is passed as-is
    # instead of slicing off its last character (which would drop the closing
    # brace of a final record that has no trailing newline). Blank lines are
    # skipped rather than fed to the JSON parser.
    data = [json.loads(line) for line in file if line.strip()]

def get_true_text(sentence: dict) -> str:
    """Return the corrected text of an annotated sentence.

    For every entry of ``sentence['annotations']`` the function keeps
    ``current_syllable`` when ``is_correct`` is truthy and otherwise takes the
    first item of ``alternative_syllables``. The chosen syllables are joined
    with spaces and then re-tokenized so that each run of word characters and
    each single punctuation character becomes its own space-separated token.

    Args:
        sentence: Mapping with an ``'annotations'`` list whose items provide
            ``'is_correct'``, ``'current_syllable'`` and
            ``'alternative_syllables'``.

    Returns:
        The tokens joined by single spaces; ``""`` when there are no
        annotations.

    Raises:
        KeyError: If an expected key is missing.
        IndexError: If an incorrect syllable has no alternatives.
    """
    true_tokens = [
        word['current_syllable'] if word['is_correct']
        else word['alternative_syllables'][0]
        for word in sentence['annotations']
    ]
    true_sentence = " ".join(true_tokens)
    # Raw string: "\w" and "\s" are regex classes, not Python string escapes.
    # A bare character class already matches exactly one character.
    words = re.findall(r"\w+|[^\w\s]", true_sentence)
    return " ".join(words)

def get_noise_text(sentence: dict) -> str:
    """Return the uncorrected (noisy) text of an annotated sentence.

    The ``current_syllable`` of every entry in ``sentence['annotations']`` is
    joined with spaces, the result is re-tokenized into runs of word
    characters and single punctuation characters, and each token is passed
    through ``normalize_diacritics`` before being joined with single spaces.

    Args:
        sentence: Mapping with an ``'annotations'`` list whose items provide
            ``'current_syllable'``.

    Returns:
        The processed tokens joined by single spaces; ``""`` when there are
        no annotations.

    Raises:
        KeyError: If an expected key is missing.
    """
    raw_sentence = " ".join(
        word['current_syllable'] for word in sentence['annotations']
    )
    # Raw string: "\w" and "\s" are regex classes, not Python string escapes.
    # A bare character class already matches exactly one character.
    words = re.findall(r"\w+|[^\w\s]", raw_sentence)
    # NOTE(review): normalize_diacritics comes from viet_text_tools; presumably
    # it canonicalizes Vietnamese tone-mark placement -- confirm in its docs.
    noised_tokens = [normalize_diacritics(word) for word in words]
    return " ".join(noised_tokens)

# Emit one line per sentence to each output file so the reference file and
# the noisy file stay aligned line-by-line.
try:
    for sentence in tqdm(data):
        true_text = get_true_text(sentence)
        noised_text = get_noise_text(sentence)

        test_file.write(true_text + "\n")
        test_noise_file.write(noised_text + "\n")
finally:
    # Always release both handles, even if converting a sentence raised, so
    # already-written lines are flushed and no file descriptor is leaked.
    try:
        test_file.close()
    finally:
        test_noise_file.close()