"""Hugging Face Inference Endpoint handler for pyannote speaker diarization."""

import base64
import logging
import os
from typing import Any, Dict

import numpy as np
import torch
from pyannote.audio import Pipeline

logger = logging.getLogger(__name__)

# Sample rate (Hz) reported to the pipeline for every request. The payload
# carries no header, so callers must send audio already at this rate.
SAMPLE_RATE = 16000

# Magnitude of the most negative int16 value; dividing by it maps 16-bit PCM
# onto the [-1.0, 1.0] float range that audio loaders such as torchaudio emit.
_INT16_FULL_SCALE = 32768.0

# Request parameters that are forwarded to the diarization pipeline.
_SPEAKER_COUNT_KEYS = ("num_speakers", "min_speakers", "max_speakers")


def _decode_pcm16(encoded: Any) -> torch.Tensor:
    """Decode base64 text into a (1, num_samples) float32 waveform tensor.

    The decoded bytes are interpreted as headerless, single-channel 16-bit
    PCM in the host's native byte order.

    Raises:
        TypeError: If ``encoded`` is not a str or bytes-like object.
        ValueError: If the base64 is malformed, the payload is empty, or its
            length is not a whole number of 16-bit samples.
    """
    raw = base64.b64decode(encoded)
    if not raw:
        raise ValueError("audio payload is empty")
    if len(raw) % 2:
        raise ValueError(
            "audio payload length is not a whole number of 16-bit samples"
        )

    # np.frombuffer returns a read-only view; astype() makes a writable
    # float32 copy so torch.from_numpy does not wrap read-only memory.
    samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
    samples /= _INT16_FULL_SCALE

    # Add the leading channel axis: (num_samples,) -> (1, num_samples).
    return torch.from_numpy(samples).unsqueeze(0)


class EndpointHandler:
    """Callable wrapper around the pyannote speaker-diarization pipeline."""

    def __init__(self, path: str = "") -> None:
        """Load the diarization pipeline and move it to the best device.

        Args:
            path: Accepted for interface compatibility; not used.

        Raises:
            ValueError: If the ``MY_KEY`` environment variable is not set.
        """
        # Retrieve the Hugging Face authentication token from the environment
        hf_token = os.getenv("MY_KEY")
        if not hf_token:
            raise ValueError(
                "Hugging Face authentication token (MY_KEY) is missing."
            )

        # Initialize the pipeline with the authentication token
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
        )

        # Move the pipeline to the appropriate device (GPU when available)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pipeline.to(device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run speaker diarization on a base64-encoded audio payload.

        Args:
            data: Request payload with
                'inputs': base64-encoded mono 16-bit PCM audio sampled at
                    ``SAMPLE_RATE`` Hz.
                'parameters': optional dict; ``num_speakers``,
                    ``min_speakers`` and ``max_speakers`` are forwarded to
                    the pipeline when present, other keys are ignored.

        Returns:
            ``{"diarization": [{"label", "start", "stop"}, ...]}`` with every
            value rendered as a string, or ``{"error": <message>}`` when the
            input is unusable or the pipeline fails.
        """
        inputs = data.get("inputs")
        if not inputs:
            return {
                "error": "Missing 'inputs': expected base64-encoded "
                "16-bit PCM audio"
            }

        parameters = data.get("parameters") or {}
        if not isinstance(parameters, dict):
            parameters = {}

        try:
            waveform = _decode_pcm16(inputs)
        except (TypeError, ValueError) as exc:
            # binascii.Error (bad base64) is a ValueError subclass.
            return {"error": f"Invalid audio input: {exc}"}

        pyannote_input = {"waveform": waveform, "sample_rate": SAMPLE_RATE}

        # Only forward speaker-count hints the caller actually supplied, so
        # requests without them behave exactly as before.
        speaker_kwargs = {
            key: parameters[key]
            for key in _SPEAKER_COUNT_KEYS
            if parameters.get(key) is not None
        }

        # Run diarization pipeline. This is the request boundary, so any
        # failure is logged and reported to the caller instead of raised.
        try:
            diarization = self.pipeline(pyannote_input, **speaker_kwargs)
        except Exception as e:
            logger.exception("An unexpected error occurred: %s", e)
            return {"error": "Diarization failed unexpectedly"}

        # Build a friendly JSON response
        processed_diarization = [
            {
                "label": str(label),
                "start": str(segment.start),
                "stop": str(segment.end),
            }
            for segment, _, label in diarization.itertracks(yield_label=True)
        ]

        return {"diarization": processed_diarization}