import argparse
import json
import math
import os

import numpy as np
import soundfile as sf

def generate_offline_input(args):
    """Dump a single whole-utterance request to offline_input.json."""
    wav_file = args.audio_file
    print("Reading {}".format(wav_file))
    waveform, sample_rate = sf.read(wav_file)
    batch_size = 1
    # Replicate the waveform batch_size times; with batch_size = 1 this is
    # simply the utterance as a (1, num_samples) float32 matrix.
    mat = np.array([waveform] * batch_size, dtype=np.float32)

    out_dict = {
        "data": [
            {
                "WAV_LENS": [len(waveform)],
                "WAV": {
                    "shape": [len(waveform)],
                    "content": mat.flatten().tolist(),
                },
            }
        ]
    }
    with open("offline_input.json", "w") as f:
        json.dump(out_dict, f)
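
# For illustration only: given a hypothetical 3-sample waveform
# [0.1, -0.2, 0.3], generate_offline_input would write offline_input.json as
#   {"data": [{"WAV_LENS": [3],
#              "WAV": {"shape": [3], "content": [0.1, -0.2, 0.3]}}]}
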
def generate_online_input(args):
    """Split a wav file into streaming chunks and dump them to online_input.json."""
    wav_file = args.audio_file
    waveform, sample_rate = sf.read(wav_file)
    chunk_size, subsampling = args.chunk_size, args.subsampling
    context = args.context
    # Input feature frames consumed by the first chunk: chunk_size output
    # frames after subsampling, plus the encoder's left context.
    first_chunk_length = (chunk_size - 1) * subsampling + context
    frame_length_ms, frame_shift_ms = args.frame_length_ms, args.frame_shift_ms

    # Extra frames needed because each frame overlaps the next by
    # (frame_length_ms - frame_shift_ms).
    add_frames = math.ceil((frame_length_ms - frame_shift_ms) / frame_shift_ms)
    first_chunk_ms = (first_chunk_length + add_frames) * frame_shift_ms
    other_chunk_ms = chunk_size * subsampling * frame_shift_ms
    first_chunk_s = first_chunk_ms / 1000
    other_chunk_s = other_chunk_ms / 1000
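    # Worked example with the defaults (chunk_size=16, subsampling=4,
    # context=7, frame_length_ms=25, frame_shift_ms=10):
    #   first_chunk_length = (16 - 1) * 4 + 7 = 67 frames
    #   add_frames         = ceil((25 - 10) / 10) = 2 frames
    #   first_chunk_ms     = (67 + 2) * 10 = 690 ms -> 0.69 s first chunk
    #   other_chunk_ms     = 16 * 4 * 10  = 640 ms -> 0.64 s per later chunk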

    # Slice the waveform: the first segment covers the (longer) first chunk,
    # every later segment covers one regular chunk; the last may fall short.
    wav_segs = []
    i = 0
    while i < len(waveform):
        if i == 0:
            stride = int(first_chunk_s * sample_rate)
        else:
            stride = int(other_chunk_s * sample_rate)
        wav_segs.append(waveform[i : i + stride])
        i += len(wav_segs[-1])

    data = {"data": [[]]}

    for idx, seg in enumerate(wav_segs):
        chunk_len = len(seg)
        # Zero-pad every segment to the fixed chunk length the server expects;
        # WAV_LENS carries the number of valid samples.
        if idx == 0:
            length = int(first_chunk_s * sample_rate)
        else:
            length = int(other_chunk_s * sample_rate)
        expect_input = np.zeros((1, length), dtype=np.float32)
        expect_input[0][0:chunk_len] = seg

        flat_chunk = expect_input.flatten().astype(np.float32).tolist()
        seq = {
            "WAV": {"content": flat_chunk, "shape": list(expect_input[0].shape)},
            "WAV_LENS": [chunk_len],
        }
        data["data"][0].append(seq)

    with open("online_input.json", "w") as f:
        json.dump(data, f)
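
# Note: the nesting above ({"data": [[step, step, ...]]}) matches the
# sequence-input JSON layout of Triton's perf_analyzer --input-data
# (outer list = streams, inner list = per-chunk steps of one stream);
# that consumer is an assumption based on the structure, not stated here.
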
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--audio_file", type=str, default=None, help="single wav file"
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        required=False,
        help="generate chunked streaming input instead of offline input",
    )
    parser.add_argument(
        "--sample_rate",
        type=int,
        required=False,
        default=16000,
        help="sample rate used in training",
    )
    parser.add_argument(
        "--frame_length_ms",
        type=int,
        required=False,
        default=25,
        help="frame length used in training",
    )
    parser.add_argument(
        "--frame_shift_ms",
        type=int,
        required=False,
        default=10,
        help="frame shift used in training",
    )
    parser.add_argument(
        "--chunk_size",
        type=int,
        required=False,
        default=16,
        help="decoding chunk size, default is 16",
    )
    parser.add_argument(
        "--context",
        type=int,
        required=False,
        default=7,
        help="conformer context size, default is 7",
    )
    parser.add_argument(
        "--subsampling",
        type=int,
        required=False,
        default=4,
        help="subsampling rate, default is 4",
    )

    args = parser.parse_args()

    # Validate the input path up front: os.path.exists(None) raises TypeError,
    # and a missing file would otherwise crash later inside sf.read.
    if args.audio_file is None or not os.path.exists(args.audio_file):
        parser.error("--audio_file must point to an existing wav file")
    if args.streaming:
        generate_online_input(args)
    else:
        generate_offline_input(args)
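
# Example usage (script and file names are illustrative):
#   python3 generate_input.py --audio_file test.wav              # -> offline_input.json
#   python3 generate_input.py --audio_file test.wav --streaming  # -> online_input.json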