Spaces:
Running
on
Zero
Running
on
Zero
import json
import os
import random
from typing import Any, Dict, Iterable, List, Optional, Sequence

import pandas as pd
import tqdm
# Upper bound on the number of rows exported to the manifest.
MAX_SAMPLES = 800000
# Sample rate written into every record. It is hard-coded, not read from the
# audio files -- assumes the referenced audio is 16 kHz; TODO confirm.
SAMPLE_RATE = 16000
# Location of the source transcript manifest (one JSON object per line).
TRANS_PATH = os.path.join('./Gigaspeech', 'trans', 'train.json')
# Destination directory and file for the generated JSONL manifest.
OUTPUT_DIR = './egs/train'
OUTPUT_FILE = './egs/train/data.jsonl'


def select_indices(
    data_length: int,
    max_samples: int = MAX_SAMPLES,
    rng: Optional[random.Random] = None,
) -> Sequence[int]:
    """Return the row positions to export.

    Args:
        data_length: Number of rows available.
        max_samples: Maximum number of rows to keep.
        rng: Optional ``random.Random`` instance used for sampling; the
            module-level ``random`` generator is used when omitted.

    Returns:
        ``range(data_length)`` when fewer than ``max_samples`` rows are
        available; otherwise ``max_samples`` unique positions drawn at
        random (in random order).
    """
    if data_length < max_samples:
        return range(data_length)
    sampler = random if rng is None else rng
    return sampler.sample(range(data_length), max_samples)


def build_records(
    paths: Sequence[Any],
    durations: Sequence[Any],
    indices: Iterable[int],
) -> List[Dict[str, Any]]:
    """Build one manifest record per selected row.

    Args:
        paths: Audio path for every row, indexed by position.
        durations: Duration for every row, indexed by position.
        indices: Positions of the rows to export, in output order.

    Returns:
        A list of dicts with the keys ``path``, ``duration``,
        ``sample_rate``, ``amplitude``, ``weight`` and ``info_path``; the
        last three are always ``None``.
    """
    return [
        {
            "path": paths[i],
            "duration": durations[i],
            "sample_rate": SAMPLE_RATE,
            "amplitude": None,
            "weight": None,
            "info_path": None,
        }
        for i in indices
    ]


def write_jsonl(records: Iterable[Dict[str, Any]], output_file: str) -> None:
    """Write ``records`` to ``output_file`` as one JSON object per line."""
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(record) + '\n' for record in records)


def main() -> None:
    """Sample rows from the transcript manifest and write the JSONL file."""
    # Load the JSON data
    jsondata = pd.read_json(path_or_buf=TRANS_PATH, lines=True)

    # Convert each column once: plain Python lists give fast positional
    # access and native (JSON-serialisable) values instead of a per-row
    # Series lookup returning numpy scalars.
    wavs = jsondata['wav'].tolist()
    durations = jsondata['duration'].tolist()

    # Keep everything when the dataset is small, otherwise draw a random
    # subset of MAX_SAMPLES unique rows.
    sample_indices = select_indices(len(jsondata))

    # Prepare the sampled data (tqdm only adds a progress bar)
    data = build_records(wavs, durations, tqdm.tqdm(sample_indices))

    # Create the output directory if it does not exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Write the sampled data to the JSONL file
    output_file = OUTPUT_FILE
    write_jsonl(data, output_file)
    print(f"Data written to {output_file}")


if __name__ == "__main__":
    main()