Spaces:
Sleeping
Sleeping
import whisper | |
import requests | |
import ffmpeg | |
import numpy as np | |
from typing import List, Dict, Any | |
def process_audio_from_url(audio_url: str) -> List[Dict[str, Any]]: | |
# Download the audio file content | |
response = requests.get(audio_url, stream=True) | |
response.raise_for_status() | |
# Use ffmpeg to decode the audio stream | |
try: | |
out, _ = ( | |
ffmpeg | |
.input('pipe:0') | |
.output('pipe:1', format='f32le', acodec='pcm_f32le', ac=1, ar='16k') | |
.run(input=response.raw.read(), capture_stdout=True, capture_stderr=True) | |
) | |
except ffmpeg.Error as e: | |
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e | |
# Convert the audio to the format Whisper expects | |
audio = np.frombuffer(out, np.float32).flatten() | |
# Load the Whisper model | |
model = whisper.load_model("base") | |
# Transcribe the audio | |
result = model.transcribe(audio) | |
segments = [] | |
for segment in result["segments"]: | |
segments.append({ | |
"file_name": audio_url.split("/")[-1], # Extract filename from URL | |
"text": segment["text"] | |
}) | |
return segments | |
def process_audio_data(audio: np.ndarray, file_name: str) -> List[Dict[str, Any]]: | |
# Load the Whisper model | |
model = whisper.load_model("base") | |
# Transcribe the audio | |
result = model.transcribe(audio) | |
segments = [] | |
for segment in result["segments"]: | |
segments.append({ | |
"file_name": file_name, # Ensure file_name is added | |
"text": segment["text"] | |
}) | |
return segments |