# handler.py — custom Inference Endpoint handler (author: KIFF, commit 6a71189)
from typing import Dict
from pyannote.audio import Pipeline
import torch
import base64
import numpy as np
import os
SAMPLE_RATE = 16000
class EndpointHandler():
    """Hugging Face Inference Endpoint handler for pyannote speaker diarization.

    Loads the ``pyannote/speaker-diarization-3.1`` pipeline once at startup and
    serves diarization requests carrying base64-encoded raw PCM audio.
    """

    def __init__(self, path=""):
        """Load the diarization pipeline and move it to GPU if available.

        Raises:
            ValueError: If the MY_KEY environment variable is not set.
        """
        # Retrieve the Hugging Face authentication token from the environment.
        hf_token = os.getenv("MY_KEY")
        if not hf_token:
            raise ValueError("Hugging Face authentication token (MY_KEY) is missing.")
        # Initialize the gated pipeline with the authentication token.
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
        )
        # Move the pipeline to the appropriate device (CPU or GPU).
        self.pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    def __call__(self, data: Dict) -> Dict:
        """Run speaker diarization on a base64-encoded audio payload.

        Args:
            data (Dict):
                'inputs': Base64-encoded raw audio bytes — assumed to be
                    16-bit PCM, mono, at 16 kHz (SAMPLE_RATE). TODO: confirm
                    the client encoder matches this format.
                'parameters': Additional diarization parameters (currently unused).

        Return:
            Dict: {"diarization": [{"label", "start", "stop"}, ...]} on success,
            or {"error": <message>} on failure.
        """
        inputs = data.get("inputs")
        parameters = data.get("parameters", {})  # Reserved for future use.

        # Guard against a missing/empty payload before attempting to decode.
        if not inputs:
            return {"error": "No audio input provided under 'inputs'"}

        try:
            audio_data = base64.b64decode(inputs)
        except (TypeError, ValueError):
            # binascii.Error is a ValueError subclass, so this covers bad base64.
            return {"error": "Invalid base64 audio payload"}

        # np.frombuffer always yields a 1-D array, so the original
        # multi-channel averaging branch was dead code and has been removed:
        # multi-channel PCM would arrive interleaved and must be handled upstream.
        audio_nparray = np.frombuffer(audio_data, dtype=np.int16)
        if audio_nparray.size == 0:
            return {"error": "Empty audio payload"}

        # Normalize 16-bit PCM to float32 in [-1, 1): pyannote pipelines expect
        # a normalized float waveform (as produced by torchaudio.load), not raw
        # integer sample values. astype() also makes a writable copy, avoiding
        # torch.from_numpy's warning on the read-only frombuffer array.
        waveform = audio_nparray.astype(np.float32) / 32768.0

        # Shape (channel=1, time) as required by pyannote's in-memory input.
        audio_tensor = torch.from_numpy(waveform).unsqueeze(0)
        pyannote_input = {"waveform": audio_tensor, "sample_rate": SAMPLE_RATE}

        # Run the diarization pipeline; this is the service boundary, so a broad
        # catch is deliberate — convert any failure into an error response.
        try:
            diarization = self.pipeline(pyannote_input)  # No num_speakers parameter
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return {"error": "Diarization failed unexpectedly"}

        # Build a friendly JSON response: one entry per speaker turn.
        processed_diarization = [
            {
                "label": str(label),
                "start": str(segment.start),
                "stop": str(segment.end),
            }
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]
        return {"diarization": processed_diarization}