import os

import librosa
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

# Below this standard deviation a feature track is treated as constant and is
# only mean-centred, so silent/flat audio does not produce NaN or inf features.
_MIN_STD = 1e-8


def _standardize(values, min_std=_MIN_STD):
    """Return *values* shifted to zero mean and scaled to unit std.

    Statistics are taken over the whole array (not per row). When the standard
    deviation is smaller than *min_std* the division is skipped and the
    mean-centred values are returned instead.
    """
    centered = values - np.mean(values)
    std = np.std(values)
    if std < min_std:
        return centered
    return centered / std


def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract normalized MFCC and pitch features from an audio file.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sample rate the audio is resampled to on load.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A 2-D array with ``n_mfcc + 1`` rows: the MFCC rows followed by one
        pitch row, one column per analysis frame.
    """
    audio, sr = librosa.load(audio_path, sr=sr)

    # MFCCs are normalized with global statistics over all coefficients.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    mfcc = _standardize(mfcc)

    # Pitch track estimated with the YIN method.
    # NOTE(review): `sr` is not forwarded to librosa.yin, so it falls back to
    # its own default sample rate. Left unchanged on purpose: trained weights
    # presumably rely on features computed exactly this way - confirm against
    # the training pipeline before changing it.
    pitch = librosa.yin(
        y=audio,
        fmin=librosa.note_to_hz('C2'),
        fmax=librosa.note_to_hz('C6'),
    )

    # Replace NaN frames with the mean of the valid frames; if no frame is
    # valid, fall back to 0 so the fill value itself is never NaN.
    has_valid = bool(np.any(~np.isnan(pitch)))
    fill_value = np.nanmean(pitch) if has_valid else 0.0
    pitch = np.nan_to_num(pitch, nan=fill_value)
    pitch = _standardize(pitch)

    # Guard against a frame-count mismatch between the two extractors so the
    # stacking below cannot fail on a length difference.
    n_frames = min(mfcc.shape[1], pitch.shape[0])
    mfcc = mfcc[:, :n_frames]
    pitch = pitch[:n_frames].reshape(1, -1)  # 2-D row so it can be stacked

    return np.vstack([mfcc, pitch])


class StatsPooling(nn.Module):
    """Pool over the last axis by concatenating its mean and std."""

    def forward(self, x):
        """Return ``cat(mean, std)`` of *x* taken over ``dim=2``.

        For an input of shape ``(batch, channels, frames)`` the result has
        shape ``(batch, 2 * channels)``.
        """
        # NOTE(review): torch.std applies Bessel's correction by default, so a
        # single-frame input gives NaN here. Kept as-is to stay consistent
        # with how existing checkpoints were presumably trained.
        mean = torch.mean(x, dim=2)
        std = torch.std(x, dim=2)
        return torch.cat((mean, std), dim=1)


class XVectorNet(nn.Module):
    """X-vector style network for binary classification.

    Five 1-D convolution layers operate per frame, statistics pooling collapses
    the frame axis, and three linear layers produce two class logits.

    Args:
        input_dim: Number of input feature rows (default 41 = 40 MFCC + 1
            pitch).
        dropout_rate: Dropout probability used after each hidden layer.
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):
        super().__init__()

        # Frame-level layers. Attribute names must stay stable because they
        # are the keys of the saved state_dict.
        self.layer1 = nn.Conv1d(input_dim, 512, 5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, 1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, 1)

        # Statistics pooling: 1500 means + 1500 stds -> 3000 features.
        self.stats_pooling = StatsPooling()

        # Segment-level layers.
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(512, 2)  # binary classification logits

    def forward(self, x):
        """Map features of shape ``(batch, input_dim, frames)`` to logits.

        Returns:
            A tensor of shape ``(batch, 2)`` with unnormalized class scores.
        """
        frame_stages = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, dropout in frame_stages:
            x = dropout(F.relu(conv(x)))
        x = F.relu(self.layer5(x))  # no dropout directly before pooling

        x = self.stats_pooling(x)

        x = self.dropout6(F.relu(self.layer6(x)))
        x = self.dropout7(F.relu(self.layer7(x)))
        return self.output(x)


def load_model(model_path, input_dim=41, dropout_rate=0.45):
    """Build an ``XVectorNet``, load its weights and switch it to eval mode.

    Args:
        model_path: Path of a file holding the model's ``state_dict``.
        input_dim: Number of input feature rows the model was built with.
        dropout_rate: Dropout probability the model was built with.

    Returns:
        The model on CPU, in evaluation mode.
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # host; callers move the model to their target device afterwards.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    return model


def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify a single audio file.

    Args:
        model: Network to evaluate; it is moved to *device* (in place) so the
            model and the input features are always on the same device.
        audio_path: Path of the audio file to classify.
        device: Device to run on; defaults to CUDA when available.

    Returns:
        A tuple ``(predicted_class, probability)`` where ``probability`` is the
        softmax probability of class 1.
    """
    features = extract_mfcc_and_pitch(audio_path)

    # Without this, a CPU model combined with the default CUDA device raises
    # a device-mismatch error.
    model = model.to(device)

    # Add the batch dimension expected by the network.
    features_tensor = torch.as_tensor(features, dtype=torch.float32)
    features_tensor = features_tensor.unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(features_tensor)
        probabilities = F.softmax(output, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

    return predicted_class, probabilities[:, 1].item()


def main_inference(model_path, audio_folder):
    """Run inference on every ``.wav`` file in *audio_folder* and print results.

    Files are processed in sorted name order and the extension match is
    case-insensitive, so output is deterministic and ``.WAV`` files are not
    skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = load_model(model_path).to(device)

    wav_files = sorted(
        name for name in os.listdir(audio_folder)
        if name.lower().endswith('.wav')
    )

    for wav_file in wav_files:
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")


if __name__ == "__main__":
    # Path to the saved model weights.
    model_path = 'output/best_overall_model.pth'

    # Folder containing the .wav files to run inference on.
    audio_folder = '/path/to/folder/test'

    main_inference(model_path, audio_folder)