|
import os |
|
import torch |
|
import numpy as np |
|
import librosa |
|
from torch import nn |
|
import torch.nn.functional as F |
|
|
|
|
|
def extract_mfcc_and_pitch(audio_path, sr=16000, n_mfcc=40):
    """Extract MFCC and pitch features from an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to load.
    sr : int
        Target sampling rate the audio is resampled to.
    n_mfcc : int
        Number of MFCC coefficients to compute.

    Returns
    -------
    np.ndarray
        Array of shape (n_mfcc + 1, n_frames): globally standardized
        MFCCs stacked on top of the standardized pitch track.
    """
    # Guards the standardization against zero variance
    # (silent or constant-valued audio).
    eps = 1e-8

    audio, sr = librosa.load(audio_path, sr=sr)

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Per-utterance global standardization over all coefficients.
    mfcc = (mfcc - np.mean(mfcc)) / (np.std(mfcc) + eps)

    # Fundamental-frequency track via YIN over a C2..C6 search range.
    pitch = librosa.yin(audio,
                        fmin=librosa.note_to_hz('C2'),
                        fmax=librosa.note_to_hz('C6'))

    # Replace any NaN frames with the mean of the valid frames; fall back
    # to 0.0 when every frame is NaN so np.nanmean cannot propagate NaN.
    valid = ~np.isnan(pitch)
    fill_value = np.mean(pitch[valid]) if np.any(valid) else 0.0
    pitch = np.nan_to_num(pitch, nan=fill_value)

    pitch = (pitch - np.mean(pitch)) / (np.std(pitch) + eps)
    pitch = pitch.reshape(1, -1)

    # MFCC and YIN use the same default hop length, so frame counts should
    # agree; truncate to the common length to avoid a vstack crash on an
    # off-by-one at the signal edges.
    n_frames = min(mfcc.shape[1], pitch.shape[1])
    combined_features = np.vstack([mfcc[:, :n_frames], pitch[:, :n_frames]])

    return combined_features
|
|
|
|
|
class XVectorNet(nn.Module):
    """X-vector style classifier over frame-level acoustic features.

    Five Conv1d frame-level layers (TDNN-like), statistics pooling over
    the time axis, then two fully connected segment-level layers and a
    2-class linear output head. Input is (batch, input_dim, time).
    """

    def __init__(self, input_dim=41, dropout_rate=0.45):
        super().__init__()

        # Frame-level convolutional stack; attribute names define the
        # state_dict keys, so they must stay as-is.
        self.layer1 = nn.Conv1d(input_dim, 512, 5, padding=2)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer3 = nn.Conv1d(512, 512, 3, padding=1)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.layer4 = nn.Conv1d(512, 512, 1)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.layer5 = nn.Conv1d(512, 1500, 1)

        # Collapses the time axis into per-channel mean and std
        # (1500 channels -> 3000-dim segment vector).
        self.stats_pooling = StatsPooling()

        # Segment-level fully connected layers.
        self.layer6 = nn.Linear(3000, 512)
        self.dropout6 = nn.Dropout(dropout_rate)
        self.layer7 = nn.Linear(512, 512)
        self.dropout7 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(512, 2)

    def forward(self, x):
        # Frame-level stack: conv -> ReLU -> dropout, four times over.
        frame_stages = (
            (self.layer1, self.dropout1),
            (self.layer2, self.dropout2),
            (self.layer3, self.dropout3),
            (self.layer4, self.dropout4),
        )
        for conv, drop in frame_stages:
            x = drop(F.relu(conv(x)))

        # Final frame-level expansion has no dropout after it.
        x = F.relu(self.layer5(x))

        # Pool variable-length frames into one fixed-size vector.
        x = self.stats_pooling(x)

        # Segment-level stack: linear -> ReLU -> dropout, twice.
        for fc, drop in ((self.layer6, self.dropout6),
                         (self.layer7, self.dropout7)):
            x = drop(F.relu(fc(x)))

        # Raw logits; callers apply softmax themselves.
        return self.output(x)
|
|
|
class StatsPooling(nn.Module):
    """Statistics pooling: collapse the time axis into summary stats.

    Maps (batch, channels, time) to (batch, 2 * channels) by
    concatenating the per-channel mean with the per-channel standard
    deviation along the channel dimension.
    """

    def forward(self, x):
        stats = (x.mean(dim=2), x.std(dim=2))
        return torch.cat(stats, dim=1)
|
|
|
|
|
def load_model(model_path, input_dim=41, dropout_rate=0.45, map_location="cpu"):
    """Build an XVectorNet and restore trained weights from a checkpoint.

    Parameters
    ----------
    model_path : str
        Path to a saved state_dict (.pth) checkpoint.
    input_dim : int
        Number of input feature channels (40 MFCCs + 1 pitch track).
    dropout_rate : float
        Dropout probability the network was built with (inactive in
        eval mode but part of the architecture definition).
    map_location : str or torch.device
        Device to materialize the loaded tensors on. Defaults to "cpu"
        so GPU-trained checkpoints load on CPU-only machines; callers
        move the model afterwards with .to(device).

    Returns
    -------
    XVectorNet
        The restored model, switched to eval mode (dropout disabled).
    """
    model = XVectorNet(input_dim=input_dim, dropout_rate=dropout_rate)
    # Without map_location, a checkpoint saved on CUDA fails to load on a
    # CPU-only host.
    state_dict = torch.load(model_path, map_location=map_location)
    model.load_state_dict(state_dict)
    model.eval()
    return model
|
|
|
|
|
def inference(model, audio_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Classify a single audio file with the given model.

    Extracts MFCC + pitch features, runs them through the model, and
    returns a tuple (predicted_class, p_class1), where p_class1 is the
    softmax probability assigned to class 1.
    """
    feats = extract_mfcc_and_pitch(audio_path)

    # Add a batch dimension: (channels, time) -> (1, channels, time).
    batch = torch.FloatTensor(feats).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        probs = F.softmax(logits, dim=1)
        winner = torch.argmax(probs, dim=1).item()

    return winner, probs[:, 1].item()
|
|
|
|
|
def main_inference(model_path, audio_folder):
    """Classify every .wav file in *audio_folder* and print the results.

    Loads the checkpoint once, moves the model to the best available
    device, then runs inference file by file.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = load_model(model_path).to(device)

    # Only plain .wav files are considered; everything else is skipped.
    for wav_file in (name for name in os.listdir(audio_folder) if name.endswith('.wav')):
        audio_path = os.path.join(audio_folder, wav_file)
        predicted_class, probability = inference(model, audio_path, device)
        print(f"File: {wav_file}, Predicted Class: {predicted_class}, Probability: {probability:.4f}")
|
|
|
if __name__ == "__main__":

    # Checkpoint produced by the training run.
    model_path = 'output/best_overall_model.pth'

    # Directory holding the .wav files to classify — placeholder path,
    # set to a real location before running.
    audio_folder = '/path/to/folder/test'

    main_inference(model_path, audio_folder)