Spaces:

sparanoid
/

milky-green-svc

Build error

App Files Files Community

sparanoid committed on Oct 21, 2022

Commit

55836f8

•

1 Parent(s): c8318dc

feat: update deps

Browse files

Files changed (4) hide show

requirements.txt +16 -0
slicer.py +163 -0
transforms.py +191 -0
utils.py +263 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+Cython==0.29.21
+librosa==0.8.0
+matplotlib==3.3.1
+numpy==1.18.5
+phonemizer==2.2.1
+scipy==1.5.2
+torch
+torchvision
+Unidecode==1.1.1
+torchaudio
+pyworld
+scipy
+keras
+mir-eval
+pretty-midi
+pydub

slicer.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import os.path
+import time
+from argparse import ArgumentParser
+import librosa
+import numpy as np
+import soundfile
+from scipy.ndimage import maximum_filter1d, uniform_filter1d
+def timeit(func):
+    def run(*args, **kwargs):
+        t = time.time()
+        res = func(*args, **kwargs)
+        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
+        return res
+    return run
+# @timeit
+def _window_maximum(arr, win_sz):
+    return maximum_filter1d(arr, size=win_sz)[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+# @timeit
+def _window_rms(arr, win_sz):
+    filtered = np.sqrt(uniform_filter1d(np.power(arr, 2), win_sz) - np.power(uniform_filter1d(arr, win_sz), 2))
+    return filtered[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+def level2db(levels, eps=1e-12):
+    return 20 * np.log10(np.clip(levels, a_min=eps, a_max=1))
+def _apply_slice(audio, begin, end):
+    if len(audio.shape) > 1:
+        return audio[:, begin: end]
+    else:
+        return audio[begin: end]
+class Slicer:
+    """Find cut positions in an audio signal inside its quiet stretches.
+
+    All time-valued constructor arguments are converted to sample counts
+    with ``round(sr * value / 1000)``, i.e. they are taken to be
+    milliseconds.
+    """
+    def __init__(self,
+                 sr: int,
+                 db_threshold: float = -40,
+                 min_length: int = 5000,
+                 win_l: int = 300,
+                 win_s: int = 20,
+                 max_silence_kept: int = 500):
+        """Store the threshold and convert the window sizes to samples.
+
+        Args:
+            sr: Sampling rate used for the millisecond -> sample conversion.
+            db_threshold: Level (dB) below which a window counts as silent.
+            min_length: Minimum distance between consecutive cuts.
+            win_l: Size of the large window used for silence detection.
+            win_s: Size of the small window used to refine a cut position.
+            max_silence_kept: Upper bound on the span searched for a cut.
+
+        Raises:
+            ValueError: If ``min_length >= win_l >= win_s`` or
+                ``max_silence_kept >= win_s`` does not hold after conversion.
+        """
+        self.db_threshold = db_threshold
+        self.min_samples = round(sr * min_length / 1000)
+        self.win_ln = round(sr * win_l / 1000)
+        self.win_sn = round(sr * win_s / 1000)
+        self.max_silence = round(sr * max_silence_kept / 1000)
+        if not self.min_samples >= self.win_ln >= self.win_sn:
+            raise ValueError('The following condition must be satisfied: min_length >= win_l >= win_s')
+        if not self.max_silence >= self.win_sn:
+            raise ValueError('The following condition must be satisfied: max_silence_kept >= win_s')
+    @timeit
+    def slice(self, audio):
+        """Compute cut positions for ``audio``.
+
+        Returns:
+            A list of integer sample indices (one per detected silence, the
+            midpoint of its refined left/right bounds), or ``[audio]`` when
+            the input is not longer than ``min_samples`` or no silence was
+            found.
+
+        NOTE(review): the two return shapes differ (array-in-a-list vs. list
+        of ints); callers have to tell them apart.
+        """
+        # Detection works on a mono signal; multi-channel input is mixed down.
+        if len(audio.shape) > 1:
+            samples = librosa.to_mono(audio)
+        else:
+            samples = audio
+        if samples.shape[0] <= self.min_samples:
+            return [audio]
+        # get absolute amplitudes
+        abs_amp = np.abs(samples - np.mean(samples))
+        # calculate local maximum with large window
+        win_max_db = level2db(_window_maximum(abs_amp, win_sz=self.win_ln))
+        # Each entry is a (left, right) pair of refined bounds of one silence.
+        sil_tags = []
+        # [left, right) is the currently open run of below-threshold windows.
+        left = right = 0
+        while right < win_max_db.shape[0]:
+            if win_max_db[right] < self.db_threshold:
+                # Still silent: extend the open run.
+                right += 1
+            elif left == right:
+                # Loud window and no run open: move both pointers forward.
+                left += 1
+                right += 1
+            else:
+                # A loud window ends the open run; refine both of its bounds.
+                if left == 0:
+                    # Run starts at the very beginning of the signal.
+                    split_loc_l = left
+                else:
+                    # Search at most max_silence samples (or half the run)
+                    # from the left edge for the quietest small window, then
+                    # the quietest sample inside that window.
+                    sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                    rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+                    split_win_l = left + np.argmin(rms_db_left)
+                    split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+                # Skip this silence when it lies closer than min_samples to
+                # the previous one (unless we are at the last window).
+                if len(sil_tags) != 0 and split_loc_l - sil_tags[-1][1] < self.min_samples and right < win_max_db.shape[
+                    0] - 1:
+                    right += 1
+                    left = right
+                    continue
+                if right == win_max_db.shape[0] - 1:
+                    # Last window: the right bound is the end of that window.
+                    split_loc_r = right + self.win_ln
+                else:
+                    # Mirror of the left-side search, anchored at the right
+                    # edge of the run.
+                    sil_right_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                    rms_db_right = level2db(_window_rms(samples[right + self.win_ln - sil_right_n: right + self.win_ln],
+                                                        win_sz=self.win_sn))
+                    split_win_r = right + self.win_ln - sil_right_n + np.argmin(rms_db_right)
+                    split_loc_r = split_win_r + np.argmin(abs_amp[split_win_r: split_win_r + self.win_sn])
+                sil_tags.append((split_loc_l, split_loc_r))
+                right += 1
+                left = right
+        # A run still open after the loop reaches the end of the signal.
+        if left != right:
+            sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+            rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+            split_win_l = left + np.argmin(rms_db_left)
+            split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+            sil_tags.append((split_loc_l, samples.shape[0]))
+        if len(sil_tags) == 0:
+            return [audio]
+        else:
+            # One cut per silence, placed at the midpoint of its two bounds.
+            chunks = []
+            for i in range(0, len(sil_tags)):
+                chunks.append(int((sil_tags[i][0] + sil_tags[i][1]) / 2))
+            return chunks
+def main():
+    parser = ArgumentParser()
+    parser.add_argument('audio', type=str, help='The audio to be sliced')
+    parser.add_argument('--out_name', type=str, help='Output directory of the sliced audio clips')
+    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
+    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
+                        help='The dB threshold for silence detection')
+    parser.add_argument('--min_len', type=int, required=False, default=5000,
+                        help='The minimum milliseconds required for each sliced audio clip')
+    parser.add_argument('--win_l', type=int, required=False, default=300,
+                        help='Size of the large sliding window, presented in milliseconds')
+    parser.add_argument('--win_s', type=int, required=False, default=20,
+                        help='Size of the small sliding window, presented in milliseconds')
+    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
+                        help='The maximum silence length kept around the sliced audio, presented in milliseconds')
+    args = parser.parse_args()
+    out = args.out
+    if out is None:
+        out = os.path.dirname(os.path.abspath(args.audio))
+    audio, sr = librosa.load(args.audio, sr=None)
+    slicer = Slicer(
+        sr=sr,
+        db_threshold=args.db_thresh,
+        min_length=args.min_len,
+        win_l=args.win_l,
+        win_s=args.win_s,
+        max_silence_kept=args.max_sil_kept
+    )
+    chunks = slicer.slice(audio)
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    start = 0
+    end_id = 0
+    for i, chunk in enumerate(chunks):
+        end = chunk
+        soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(i).zfill(2))), audio[start:end], sr)
+        start = end
+        end_id = i + 1
+    soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(end_id).zfill(2))), audio[start:len(audio)],
+                    sr)
+if __name__ == '__main__':
+    main()

transforms.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import numpy as np
+import torch
+from torch.nn import functional as t_func
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+def piecewise_rational_quadratic_transform(inputs,
+                                           unnormalized_widths,
+                                           unnormalized_heights,
+                                           unnormalized_derivatives,
+                                           inverse=False,
+                                           tails=None,
+                                           tail_bound=1.,
+                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
+    if tails is None:
+        spline_fn = rational_quadratic_spline
+        spline_kwargs = {}
+    else:
+        spline_fn = unconstrained_rational_quadratic_spline
+        spline_kwargs = {
+            'tails': tails,
+            'tail_bound': tail_bound
+        }
+    outputs, logabsdet = spline_fn(
+        inputs=inputs,
+        unnormalized_widths=unnormalized_widths,
+        unnormalized_heights=unnormalized_heights,
+        unnormalized_derivatives=unnormalized_derivatives,
+        inverse=inverse,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+        **spline_kwargs
+    )
+    return outputs, logabsdet
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    bin_locations[..., -1] += eps
+    return torch.sum(
+        inputs[..., None] >= bin_locations,
+        dim=-1
+    ) - 1
+def unconstrained_rational_quadratic_spline(inputs,
+                                            unnormalized_widths,
+                                            unnormalized_heights,
+                                            unnormalized_derivatives,
+                                            inverse=False,
+                                            tails='linear',
+                                            tail_bound=1.,
+                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+    if tails == 'linear':
+        unnormalized_derivatives = t_func.pad(unnormalized_derivatives, pad=(1, 1))
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        unnormalized_derivatives[..., -1] = constant
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError('{} tails are not implemented.'.format(tails))
+    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
+        inputs=inputs[inside_interval_mask],
+        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+        inverse=inverse,
+        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative
+    )
+    return outputs, logabsdet
+def rational_quadratic_spline(inputs,
+                              unnormalized_widths,
+                              unnormalized_heights,
+                              unnormalized_derivatives,
+                              inverse=False,
+                              left=0., right=1., bottom=0., top=1.,
+                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                              min_derivative=DEFAULT_MIN_DERIVATIVE):
+    """Apply a piecewise rational-quadratic spline (or its inverse).
+
+    The spline maps ``[left, right]`` onto ``[bottom, top]``. Bin widths and
+    heights come from a softmax over the last axis of the unnormalized
+    parameters, knot derivatives from a softplus.
+
+    Returns:
+        ``(outputs, logabsdet)``; in inverse mode the log-determinant is
+        negated before it is returned.
+
+    Raises:
+        ValueError: If any input lies outside ``[left, right]``, or if
+            ``min_bin_width``/``min_bin_height`` times the number of bins
+            exceeds 1.
+        AssertionError: In inverse mode, if the quadratic's discriminant is
+            negative for any element.
+    """
+    if torch.min(inputs) < left or torch.max(inputs) > right:
+        raise ValueError('Input to a transform is not within its domain')
+    num_bins = unnormalized_widths.shape[-1]
+    if min_bin_width * num_bins > 1.0:
+        raise ValueError('Minimal bin width too large for the number of bins')
+    if min_bin_height * num_bins > 1.0:
+        raise ValueError('Minimal bin height too large for the number of bins')
+    # Bin widths: softmax, then floor every bin at min_bin_width while
+    # keeping the total equal to one.
+    widths = t_func.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    # Knot x-positions: cumulative widths with a leading zero, rescaled to
+    # [left, right]; the end points are pinned exactly.
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = t_func.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    cumwidths[..., 0] = left
+    cumwidths[..., -1] = right
+    # Recompute widths from the pinned knots so both stay consistent.
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+    # Knot derivatives, bounded below by min_derivative.
+    derivatives = min_derivative + t_func.softplus(unnormalized_derivatives)
+    # Same construction for bin heights / knot y-positions on [bottom, top].
+    heights = t_func.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = t_func.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    cumheights[..., -1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+    # Locate each input's bin: along y when inverting, along x otherwise.
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+    # Gather the parameters of the selected bin for every input.
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    # delta is the average slope (height / width) of each bin.
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+    # Derivatives at the bin's left knot and at its right knot.
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+    if inverse:
+        # Coefficients a, b, c of the quadratic whose root is the relative
+        # position of the pre-image inside the bin.
+        a = (((inputs - input_cumheights) * (input_derivatives
+                                             + input_derivatives_plus_one
+                                             - 2 * input_delta)
+              + input_heights * (input_delta - input_derivatives)))
+        b = (input_heights * input_derivatives
+             - (inputs - input_cumheights) * (input_derivatives
+                                              + input_derivatives_plus_one
+                                              - 2 * input_delta))
+        c = - input_delta * (inputs - input_cumheights)
+        discriminant = b.pow(2) - 4 * a * c
+        assert (discriminant >= 0).all()
+        # Root written as 2c / (-b - sqrt(disc)) rather than the textbook
+        # form; presumably chosen for numerical stability - TODO confirm.
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        outputs = root * input_bin_widths + input_cumwidths
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - root).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+        # The log-determinant is computed at the recovered point and negated.
+        return outputs, -logabsdet
+    else:
+        # theta is the relative position of the input inside its bin.
+        theta = (inputs - input_cumwidths) / input_bin_widths
+        theta_one_minus_theta = theta * (1 - theta)
+        numerator = input_heights * (input_delta * theta.pow(2)
+                                     + input_derivatives * theta_one_minus_theta)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        outputs = input_cumheights + numerator / denominator
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - theta).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+        return outputs, logabsdet

utils.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import argparse
+import glob
+import json
+import logging
+import os
+import subprocess
+import sys
+import numpy as np
+import torch
+from scipy.io.wavfile import read
+MATPLOTLIB_FLAG = False
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logger = logging
+def load_checkpoint(checkpoint_path, model, optimizer=None):
+    assert os.path.isfile(checkpoint_path)
+    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
+    iteration = checkpoint_dict['iteration']
+    learning_rate = checkpoint_dict['learning_rate']
+    if optimizer is not None:
+        optimizer.load_state_dict(checkpoint_dict['optimizer'])
+    # print(1111)
+    saved_state_dict = checkpoint_dict['model']
+    # print(1111)
+    if hasattr(model, 'module'):
+        state_dict = model.module.state_dict()
+    else:
+        state_dict = model.state_dict()
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        try:
+            new_state_dict[k] = saved_state_dict[k]
+        except Exception as e:
+            logger.info(e)
+            logger.info("%s is not in the checkpoint" % k)
+            new_state_dict[k] = v
+    if hasattr(model, 'module'):
+        model.module.load_state_dict(new_state_dict)
+    else:
+        model.load_state_dict(new_state_dict)
+    logger.info("Loaded checkpoint '{}' (iteration {})".format(
+        checkpoint_path, iteration))
+    return model, optimizer, learning_rate, iteration
+def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
+    logger.info("Saving model and optimizer state at iteration {} to {}".format(
+        iteration, checkpoint_path))
+    if hasattr(model, 'module'):
+        state_dict = model.module.state_dict()
+    else:
+        state_dict = model.state_dict()
+    torch.save({'model': state_dict,
+                'iteration': iteration,
+                'optimizer': optimizer.state_dict(),
+                'learning_rate': learning_rate}, checkpoint_path)
+def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
+    for k, v in scalars.items():
+        writer.add_scalar(k, v, global_step)
+    for k, v in histograms.items():
+        writer.add_histogram(k, v, global_step)
+    for k, v in images.items():
+        writer.add_image(k, v, global_step, dataformats='HWC')
+    for k, v in audios.items():
+        writer.add_audio(k, v, global_step, audio_sampling_rate)
+def latest_checkpoint_path(dir_path, regex="G_*.pth"):
+    f_list = glob.glob(os.path.join(dir_path, regex))
+    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+    x = f_list[-1]
+    print(x)
+    return x
+def plot_spectrogram_to_numpy(spectrogram):
+    global MATPLOTLIB_FLAG
+    if not MATPLOTLIB_FLAG:
+        import matplotlib
+        matplotlib.use("Agg")
+        MATPLOTLIB_FLAG = True
+        mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger.setLevel(logging.WARNING)
+    import matplotlib.pylab as plt
+    import numpy
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                   interpolation='none')
+    plt.colorbar(im, ax=ax)
+    plt.xlabel("Frames")
+    plt.ylabel("Channels")
+    plt.tight_layout()
+    fig.canvas.draw()
+    data = numpy.fromstring(fig.canvas.tostring_rgb(), dtype=numpy.uint8, sep='')
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    plt.close()
+    return data
+def plot_alignment_to_numpy(alignment, info=None):
+    global MATPLOTLIB_FLAG
+    if not MATPLOTLIB_FLAG:
+        import matplotlib
+        matplotlib.use("Agg")
+        MATPLOTLIB_FLAG = True
+        mpl_logger = logging.getLogger('matplotlib')
+        mpl_logger.setLevel(logging.WARNING)
+    import matplotlib.pylab as plt
+    import numpy
+    fig, ax = plt.subplots(figsize=(6, 4))
+    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
+                   interpolation='none')
+    fig.colorbar(im, ax=ax)
+    xlabel = 'Decoder timestep'
+    if info is not None:
+        xlabel += '\n\n' + info
+    plt.xlabel(xlabel)
+    plt.ylabel('Encoder timestep')
+    plt.tight_layout()
+    fig.canvas.draw()
+    data = numpy.fromstring(fig.canvas.tostring_rgb(), dtype=numpy.uint8, sep='')
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    plt.close()
+    return data
+def load_wav_to_torch(full_path):
+    sampling_rate, data = read(full_path)
+    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+def load_filepaths_and_text(filename, split="|"):
+    with open(filename, encoding='utf-8') as f:
+        filepaths_and_text = [line.strip().split(split) for line in f]
+    return filepaths_and_text
+def get_hparams(init=True):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
+                        help='JSON file for configuration')
+    parser.add_argument('-m', '--model', type=str, required=True,
+                        help='Model name')
+    args = parser.parse_args()
+    model_dir = os.path.join("./logs", args.model)
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    config_path = args.config
+    config_save_path = os.path.join(model_dir, "config.json")
+    if init:
+        with open(config_path, "r") as f:
+            data = f.read()
+        with open(config_save_path, "w") as f:
+            f.write(data)
+    else:
+        with open(config_save_path, "r") as f:
+            data = f.read()
+    config = json.loads(data)
+    hparams = HParams(**config)
+    hparams.model_dir = model_dir
+    return hparams
+def get_hparams_from_dir(model_dir):
+    config_save_path = os.path.join(model_dir, "config.json")
+    with open(config_save_path, "r") as f:
+        data = f.read()
+    config = json.loads(data)
+    hparams = HParams(**config)
+    hparams.model_dir = model_dir
+    return hparams
+def get_hparams_from_file(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        data = f.read()
+    config = json.loads(data)
+    hparams = HParams(**config)
+    return hparams
+def check_git_hash(model_dir):
+    source_dir = os.path.dirname(os.path.realpath(__file__))
+    if not os.path.exists(os.path.join(source_dir, ".git")):
+        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
+            source_dir
+        ))
+        return
+    cur_hash = subprocess.getoutput("git rev-parse HEAD")
+    path = os.path.join(model_dir, "githash")
+    if os.path.exists(path):
+        saved_hash = open(path).read()
+        if saved_hash != cur_hash:
+            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
+                saved_hash[:8], cur_hash[:8]))
+    else:
+        open(path, "w").write(cur_hash)
+def get_logger(model_dir, filename="train.log"):
+    global logger
+    logger = logging.getLogger(os.path.basename(model_dir))
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    h = logging.FileHandler(os.path.join(model_dir, filename))
+    h.setLevel(logging.DEBUG)
+    h.setFormatter(formatter)
+    logger.addHandler(h)
+    return logger
+class HParams:
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if type(v) == dict:
+                v = HParams(**v)
+            self[k] = v
+    def keys(self):
+        return self.__dict__.keys()
+    def items(self):
+        return self.__dict__.items()
+    def values(self):
+        return self.__dict__.values()
+    def __len__(self):
+        return len(self.__dict__)
+    def __getitem__(self, key):
+        return getattr(self, key)
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+    def __contains__(self, key):
+        return key in self.__dict__
+    def __repr__(self):
+        return self.__dict__.__repr__()