File size: 2,078 Bytes
5548515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import numpy as np
from numpy import linalg as LA
import librosa
import soundfile as sf
import librosa.filters


def load_audio_torch(wave_file, fs):
    """Load an audio file as a mono, magnitude-normalized torch tensor.

    Args:
        wave_file (str): path to wave file
        fs (int or None): target sample rate handed to ``librosa.load`` as
            ``sr``. NOTE(review): per the librosa documentation ``sr=None``
            keeps the file's native rate - confirm against the installed
            librosa version.

    Returns:
        audio (tensor): float tensor of samples divided by the detected
            full-scale magnitude. An empty list ``[]`` is returned instead
            when the normalized samples contain NaN or Inf.
        fs (int): sample rate of the returned audio. This is the rate
            reported by the loader (equal to ``fs`` whenever ``fs`` is
            given), so it is never ``None`` on the normal path.

    Raises:
        AssertionError: if the file holds two samples or fewer.
    """

    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
    # audio: (T,)
    assert len(audio) > 2, "audio must contain more than 2 samples"

    # Check the audio type (for soundfile loading backbone) - float, 8bit or 16bit
    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM: scale by the magnitude of the most negative value.
        max_mag = -np.iinfo(audio.dtype).min
    else:
        # Float data may still carry integer-range values; guess the
        # full-scale magnitude from the observed peak.
        peak = max(np.amax(audio), -np.amin(audio))
        if peak > (2**15):
            max_mag = (2**31) + 1
        elif peak > 1.01:
            max_mag = (2**15) + 1
        else:
            max_mag = 1.0

    # Normalize the audio
    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt samples: signal failure with an empty list and a best-effort rate.
    if (torch.isnan(audio) | torch.isinf(audio)).any():
        return [], sample_rate or fs or 48000

    # Resample the audio to our target samplerate
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the rate actually in effect; returning ``fs`` here would hand
    # back ``None`` to callers that asked for the native rate.
    return audio, sample_rate


def _stft(y, cfg):
    """Run ``librosa.stft`` on ``y`` using the framing settings in ``cfg``.

    Args:
        y: signal passed through to ``librosa.stft`` as ``y``
        cfg: config object providing ``n_fft``, ``hop_size`` and ``win_size``

    Returns:
        Whatever ``librosa.stft`` returns for these arguments.
    """
    # Map the config's field names onto librosa's keyword names.
    framing = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **framing)


def energy(wav, cfg):
    """Compute an energy contour as the norm of the STFT magnitudes.

    Args:
        wav: waveform forwarded to ``_stft``
        cfg: config object forwarded to ``_stft``

    Returns:
        Array of norms taken along axis 1 of the transposed magnitude
        spectrogram.
    """
    spectrum = _stft(wav, cfg)
    # Transposed magnitudes. NOTE(review): librosa documents the STFT as
    # (freq, frames), which would make this (frames, freq) and the result
    # one value per frame - confirm.
    transposed_mag = np.abs(spectrum).T
    contour = LA.norm(transposed_mag, axis=1)
    return contour


def get_energy_from_tacotron(audio, _stft):
    """Compute mel-spectrogram and energy through an STFT module object.

    Args:
        audio: waveform samples accepted by ``torch.FloatTensor``
            (presumably a 1-D sequence of shape (T,) - confirm with callers)
        _stft: object exposing ``mel_spectrogram(batch)`` that returns a
            ``(mel, energy)`` pair. Inside this function the parameter
            shadows the module-level ``_stft`` helper.

    Returns:
        mel: first value returned by ``_stft.mel_spectrogram``, unchanged
        energy (np.ndarray): second returned value with dimension 0
            squeezed, converted to a float32 numpy array
    """
    # Add a leading dimension and clamp samples to [-1, 1]. The deprecated
    # ``torch.autograd.Variable`` wrapper is dropped: plain tensors already
    # default to ``requires_grad=False``.
    batch = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    mel, frame_energy = _stft.mel_spectrogram(batch)
    # ``detach`` keeps the numpy conversion from raising when the module's
    # output is attached to an autograd graph.
    frame_energy = (
        torch.squeeze(frame_energy, 0).detach().numpy().astype(np.float32)
    )
    return mel, frame_energy