File size: 2,240 Bytes
f5b4ff2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# @ [email protected]

import os
import argparse
from tqdm import tqdm
import pandas as pd

def parse_args():
    """Parse this script's command-line options.

    Returns:
        argparse.Namespace with the attributes ``dataset_name`` (str,
        default ``'English'``), ``dataset_dir`` (str or None), ``save_dir``
        (str or None), ``audio_min_length`` (float, default 1) and
        ``encodec_sr`` (int, default 50).
    """
    # NOTE(review): the description below mentions encoding, while the rest of
    # this script only writes manifest files -- confirm the wording is intended.
    cli = argparse.ArgumentParser(description="encode the librilight dataset using encodec model")

    # (flag, keyword arguments) pairs, registered in this order.
    option_table = (
        ("--dataset_name", dict(type=str, default='English', help='name tag of dataset')),
        ('--dataset_dir', dict(type=str, default=None)),
        ('--save_dir', dict(type=str, default=None, help="path to the manifest, phonemes, and encodec codes dirs")),
        ("--audio_min_length", dict(type=float, default=1, help="in second, drop the audio if length is shorter than this")),
        ("--encodec_sr", dict(type=int, default=50, help="for my encodec that takes 16kHz audio with a downsample rate of 320, the codec sample rate is 50Hz, i.e. 50 codes (x n_codebooks) per second")),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
    
def count_codes(codes_path):
    """Return how many space-separated tokens are on the first line of a file.

    Only the first line is read. An empty file yields 0 instead of raising,
    so a truncated or empty codes file is filtered out by the length check
    rather than aborting the whole run.

    Args:
        codes_path: path of the text file to inspect.

    Returns:
        int: ``len(first_line.split(' '))``, or 0 when the file has no content.
    """
    with open(codes_path, 'r') as fi:
        first_line = fi.readline()
    if not first_line:
        return 0
    return len(first_line.split(' '))


def collect_entries(segment_ids, phn_root, codes_root, min_codes):
    """Select the segments that have both files and enough codes.

    A segment is kept when ``<phn_root>/<id>.txt`` and ``<codes_root>/<id>.txt``
    both exist and the code count of the latter is strictly greater than
    ``min_codes`` (segments exactly at the threshold are dropped).

    Args:
        segment_ids: iterable of segment identifiers.
        phn_root: directory holding the per-segment phoneme files.
        codes_root: directory holding the per-segment codes files.
        min_codes: exclusive lower bound on the number of codes.

    Returns:
        list of ``(segment_id, n_codes)`` tuples in input order.
    """
    entries = []
    for segment_id in segment_ids:
        filename = str(segment_id) + ".txt"
        codes_path = os.path.join(codes_root, filename)
        if not os.path.exists(os.path.join(phn_root, filename)):
            continue
        if not os.path.exists(codes_path):
            continue
        n_codes = count_codes(codes_path)
        if n_codes > min_codes:
            entries.append((segment_id, n_codes))
    return entries


def format_manifest(entries):
    """Render entries as manifest text, one ``0<TAB>id<TAB>n_codes`` line each.

    Args:
        entries: iterable of ``(segment_id, n_codes)`` pairs.

    Returns:
        str: the concatenated lines (empty string for no entries).
    """
    # NOTE(review): the constant leading "0" column is reproduced from the
    # original output format; its meaning is not visible in this file.
    return ''.join("0\t{}\t{}\n".format(segment_id, n_codes) for segment_id, n_codes in entries)


def main():
    """Write one manifest file per split under ``<save_dir>/<dataset_name>/manifest``.

    For each of the splits ``train``, ``validation`` and ``test`` this reads
    ``<dataset_dir>/trans/<split>.json`` as JSON lines, keeps the segments
    accepted by ``collect_entries`` and writes them to ``<split>.txt``.

    Raises:
        SystemExit: if ``--dataset_dir`` or ``--save_dir`` was not supplied.
    """
    args = parse_args()
    # Fail early with a readable message; otherwise os.path.join(None, ...)
    # raises an opaque TypeError.
    if args.dataset_dir is None or args.save_dir is None:
        raise SystemExit("both --dataset_dir and --save_dir must be provided")

    dataset_root = os.path.join(args.save_dir, args.dataset_name)
    phn_save_root = os.path.join(dataset_root, "phonemes")
    codes_save_root = os.path.join(dataset_root, "encodec_16khz_4codebooks")
    manifest_root = os.path.join(dataset_root, "manifest")
    os.makedirs(manifest_root, exist_ok=True)

    # Loop-invariant threshold: minimum duration (seconds) times codes per second.
    min_codes = int(args.audio_min_length * args.encodec_sr)

    for split in ['train', 'validation', 'test']:
        jsondata = pd.read_json(path_or_buf=os.path.join(args.dataset_dir, 'trans', split+'.json'), lines=True)
        # An empty transcript file gives a frame without a 'segment_id' column.
        segment_ids = jsondata['segment_id'].tolist() if len(jsondata) else []
        entries = collect_entries(tqdm(segment_ids), phn_save_root, codes_save_root, min_codes)
        with open(os.path.join(manifest_root, split+'.txt'), "w") as f:
            f.write(format_manifest(entries))


if __name__ == "__main__":
    main()