import io
import re
from typing import Any, Dict

from faster_whisper import WhisperModel

# Collapses runs of spaces that remain after joining the stripped segment texts.
_MULTI_SPACE = re.compile(r" +")


class EndpointHandler:
    """Inference handler that transcribes audio bytes with faster-whisper.

    The Whisper model is loaded once at construction time and reused for
    every request.
    """

    def __init__(self, model_dir=None):
        """Load the Whisper model on the GPU.

        Args:
            model_dir: Model size name or path handed to ``WhisperModel``.
                When ``None``, the ``"large-v2"`` model is used.
        """
        model_size = "large-v2" if model_dir is None else model_dir
        # float16 on CUDA trades a little precision for faster inference.
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")

    def __call__(self, data: Dict) -> Dict[str, Any]:
        """Transcribe the audio found under ``data["inputs"]``.

        Args:
            data: Request payload; ``data["inputs"]`` must hold the raw bytes
                of an audio file.

        Returns:
            A dict with the keys ``"text"`` (the transcription with
            whitespace normalized), ``"language"`` (detected language code)
            and ``"language_probability"`` (confidence of that detection).

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` key.
        """
        audio_bytes = data["inputs"]

        # The model's transcribe() is given a file-like object, so wrap the bytes.
        audio_file = io.BytesIO(audio_bytes)

        # Reuse the model loaded in __init__ rather than building a new object
        # per request. beam_size=1 (greedy decoding) favours speed, and
        # vad_filter=True enables voice-activity detection to skip silence.
        segments, info = self.model.transcribe(
            audio_file,
            beam_size=1,
            vad_filter=True,
        )

        # NOTE: per the faster-whisper docs, `segments` is a lazy generator;
        # the actual decoding happens while it is consumed by the join below.
        text = " ".join(segment.text.strip() for segment in segments)
        # Stripping a whitespace-only segment leaves an empty string, which
        # would otherwise show up as a double space after joining.
        text = _MULTI_SPACE.sub(" ", text)

        return {
            "text": text,
            "language": info.language,
            "language_probability": info.language_probability,
        }