|
import os |
|
import torch |
|
import numpy as np |
|
import librosa |
|
from torch import nn |
|
import torch.nn.functional as F |
|
|
|
|
|
def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract MFCC and pitch features from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to load.
    sr : int
        Target sampling rate the audio is resampled to.
    n_mfcc : int
        Number of MFCC coefficients to compute.

    Returns
    -------
    np.ndarray
        Array of shape (n_mfcc + 1, n_frames): globally standardized
        MFCCs stacked on top of the standardized pitch track.
    """
    # Guards the standardization against zero variance
    # (silent or constant-valued audio).
    eps = 1e-8

    audio, sr = librosa.load(audio_path, sr=sr)

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Per-utterance global standardization over all coefficients.
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + eps)

    # Fundamental-frequency track via YIN over a C2..C6 search range.
    pitch = librosa.yin(audio,
                        fmin=librosa.note_to_hz('C2'),
                        fmax=librosa.note_to_hz('C6'))

    # Replace any NaN frames with the mean of the valid frames; fall back
    # to 0.0 when every frame is NaN so np.nanmean cannot propagate NaN.
    valid = ~np.isnan(pitch)
    fill_value = np.mean(pitch[valid]) if np.any(valid) else 0.0
    pitch = np.nan_to_num(pitch, nan=fill_value)

    pitch = (pitch - np.mean(pitch)) / (np.std(pitch) + eps)
    pitch = pitch.reshape(1, -1)

    # MFCC and YIN use the same default hop length, so frame counts should
    # agree; truncate to the common length to avoid a vstack crash on an
    # off-by-one at the signal edges.
    n_frames = min(mfcc.shape[1], pitch.shape[1])
    combined_features = np.vstack([mfcc[:, :n_frames], pitch[:, :n_frames]])

    return combined_features
|
|
|
|
|
class XVectorNet(nn.Module):
    """X-vector style classifier over frame-level acoustic features.

    Five Conv1d frame-level layers (TDNN-like), statistics pooling over
    the time axis, then two fully connected segment-level layers and a
    2-class linear output head. Input is (batch, input_dim, time).
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):
        super().__init__()

        # Frame-level convolutional stack; attribute names define the
        # state_dict keys, so they must stay as-is.
        self.layer1 = nn.Conv1d(input_dim, 512, 5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, 1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, 1)

        # Collapses the time axis into per-channel mean and std
        # (1500 channels -> 3000-dim segment vector).
        self.stats_pooling = StatsPooling()

        # Segment-level fully connected layers.
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(512, 2)

    def forward(self, x):
        # Frame-level stack: conv -> ReLU -> dropout, four times over.
        frame_stages = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_stages:
            x = drop(F.relu(conv(x)))

        # Final frame-level expansion has no dropout after it.
        x = F.relu(self.layer5(x))

        # Pool variable-length frames into one fixed-size vector.
        x = self.stats_pooling(x)

        # Segment-level stack: linear -> ReLU -> dropout, twice.
        for fc, drop in ((self.layer6, self.dropout6),
                         (self.layer7, self.dropout7)):
            x = drop(F.relu(fc(x)))

        # Raw logits; callers apply softmax themselves.
        return self.output(x)
|
|
|
class StatsPooling(nn.Module):
    """Statistics pooling: collapse the time axis into summary stats.

    Maps (batch, channels, time) to (batch, 2 * channels) by
    concatenating the per-channel mean with the per-channel standard
    deviation along the channel dimension.
    """

    def forward(self, x):
        stats = (x.mean(dim=2), x.std(dim=2))
        return torch.cat(stats, dim=1)
|
|
|
|
|
def load_model(model_path, input_dim=41, dropout_rate=0.45, map_location="cpu"):
    """Build an XVectorNet and restore trained weights from a checkpoint.

    Parameters
    ----------
    model_path : str
        Path to a saved state_dict (.pth) checkpoint.
    input_dim : int
        Number of input feature channels (40 MFCCs + 1 pitch track).
    dropout_rate : float
        Dropout probability the network was built with (inactive in
        eval mode but part of the architecture definition).
    map_location : str or torch.device
        Device to materialize the loaded tensors on. Defaults to "cpu"
        so GPU-trained checkpoints load on CPU-only machines; callers
        move the model afterwards with .to(device).

    Returns
    -------
    XVectorNet
        The restored model, switched to eval mode (dropout disabled).
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # Without map_location, a checkpoint saved on CUDA fails to load on a
    # CPU-only host.
    state_dict = torch.load(model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    model.eval()
    return model
|
|
|
|
|
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify a single audio file with the given model.

    Extracts MFCC + pitch features, runs them through the model, and
    returns a tuple (predicted_class, p_class1), where p_class1 is the
    softmax probability assigned to class 1.
    """
    feats = extract_mfcc_and_pitch(audio_path)

    # Add a batch dimension: (channels, time) -> (1, channels, time).
    batch = torch.FloatTensor(feats).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        probs = F.softmax(logits, dim=1)
        winner = torch.argmax(probs, dim=1).item()

    return winner, probs[:, 1].item()
|
|
|
|
|
def main_inference(model_path, audio_folder):
    """Classify every .wav file in *audio_folder* and print the results.

    Loads the checkpoint once, moves the model to the best available
    device, then runs inference file by file.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = load_model(model_path).to(device)

    # Only plain .wav files are considered; everything else is skipped.
    for wav_file in (name for name in os.listdir(audio_folder) if name.endswith('.wav')):
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
|
|
|
if __name__ == "__main__":

    # Checkpoint produced by the training run.
    model_path = 'output/best_overall_model.pth'

    # Directory holding the .wav files to classify — placeholder path,
    # set to a real location before running.
    audio_folder = '/path/to/folder/test'

    main_inference(model_path, audio_folder)