# verbisense/src/audio_processor.py
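"""Audio transcription helpers for Verbisense, built on OpenAI's Whisper model."""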
import whisper
import requests
import ffmpeg
import numpy as np
from typing import List, Dict, Any


def process_audio_from_url(audio_url: str) -> List[Dict[str, Any]]:
    """Download an audio file, decode it with ffmpeg, and transcribe it with Whisper."""
    # Download the audio file content
    response = requests.get(audio_url, stream=True)
    response.raise_for_status()

    # Use ffmpeg to decode the audio stream into 16 kHz mono float32 PCM,
    # the sample rate and format that Whisper expects
    try:
        out, _ = (
            ffmpeg
            .input('pipe:0')
            .output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar=16000)
            .run(input=response.content, capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    # Convert the raw PCM bytes to a flat float32 array
    audio = np.frombuffer(out, np.float32).flatten()

    # Delegate transcription to the shared helper; the file name is taken from the URL
    return process_audio_data(audio, audio_url.split("/")[-1])


def process_audio_data(audio: np.ndarray, file_name: str) -> List[Dict[str, Any]]:
    """Transcribe a 16 kHz mono float32 audio array with Whisper."""
    # Load the Whisper model
    model = whisper.load_model("base")

    # Transcribe the audio
    result = model.transcribe(audio)

    # Collect one entry per segment, tagged with the source file name
    segments = []
    for segment in result["segments"]:
        segments.append({
            "file_name": file_name,
            "text": segment["text"]
        })
    return segments
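

# Minimal usage sketch, assuming the module is run directly; the URL below is a
# hypothetical placeholder, not an endpoint defined by the Verbisense project.
if __name__ == "__main__":
    sample_url = "https://example.com/sample.wav"  # hypothetical example URL
    for segment in process_audio_from_url(sample_url):
        print(segment["file_name"], segment["text"])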