import threading

import av
import numpy as np
import streamlit as st
from streamlit_webrtc import AudioProcessorBase, WebRtcMode, webrtc_streamer
from transformers import pipeline

# ASR model, cached so Streamlit does not reload it on every rerun
# (note: this checkpoint is a MuST-C speech *translation* model)
@st.cache_resource
def load_pipeline():
    return pipeline(
        "automatic-speech-recognition",
        model="facebook/s2t-medium-mustc-multilingual-st",
    )

pipe = load_pipeline()

# Buffers incoming audio frames. WebRTC callbacks run on a worker thread
# where st.session_state is not available, so audio is kept on the
# processor instance behind a lock instead.
class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self._lock = threading.Lock()
        self._chunks = []
        self.sample_rate = None

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        data = frame.to_ndarray()
        n_channels = len(frame.layout.channels)
        if not frame.format.is_planar and n_channels > 1:
            # Packed frames arrive as (1, samples * channels), interleaved;
            # reshape to (channels, samples) before downmixing
            data = data.reshape(-1, n_channels).T
        # Downmix to mono and scale 16-bit samples to [-1.0, 1.0]
        mono = data.astype(np.float32).mean(axis=0) / 32768.0
        with self._lock:
            self._chunks.append(mono)
            self.sample_rate = frame.sample_rate
        return frame

    def get_audio(self):
        with self._lock:
            return list(self._chunks), self.sample_rate

# Transcribe the buffered audio
def transcribe_audio(processor):
    chunks, sample_rate = processor.get_audio()
    if not chunks:
        st.warning("No audio captured yet.")
        return
    audio = np.concatenate(chunks)
    # The pipeline resamples to the model's 16 kHz rate (needs torchaudio)
    result = pipe({"raw": audio, "sampling_rate": sample_rate})
    st.write("Transcription:", result["text"])

# Streamlit UI
st.title("Voice Recognition App")

ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDRECV,
    audio_processor_factory=AudioProcessor,
    media_stream_constraints={"audio": True, "video": False},
)

if st.button("Transcribe Audio") and ctx.audio_processor:
    transcribe_audio(ctx.audio_processor)
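
# Usage note (a sketch; the file name is hypothetical and the package names
# below are the usual PyPI ones): save this script as app.py, install the
# dependencies, and launch it with Streamlit:
#
#   pip install streamlit streamlit-webrtc transformers torch torchaudio av numpy
#   streamlit run app.py
#
# Click "Start" in the WebRTC widget to grant microphone access, speak for a
# few seconds, then press "Transcribe Audio" to run the buffered audio
# through the model.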