"""Gradio app that measures how much of a conversation the first speaker talks.

Records (or accepts) an audio clip, runs pyannote speaker diarization on it,
and reports the percentage of total speech time attributed to the speaker
who talks first.
"""

import os
import wave  # NOTE(review): unused in this file; kept in case external code relies on it

import gradio as gr
import soundfile as sf
import torch
from pyannote.audio import Pipeline

# Load the diarization pipeline once at startup. HF_AUTH_TOKEN must grant
# access to the gated pyannote/speaker-diarization-3.0 model.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=os.getenv("HF_AUTH_TOKEN"),
)
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


def process_audio(audio_file):
    """Diarize *audio_file* and return the percentage of total speech time
    belonging to the speaker who speaks first.

    Parameters
    ----------
    audio_file : str
        Path to an audio file readable by the pyannote pipeline.

    Returns
    -------
    float
        Share (0-100) of total spoken time attributed to the speaker whose
        turn starts earliest; 0 when no speech is detected.
    """
    diarization = pipeline(audio_file)

    # Iterate the diarization result directly instead of round-tripping
    # through a fixed-path RTTM file on disk (the original wrote audio.rttm
    # and re-parsed it, which is slower and unsafe if two requests overlap).
    speaker_durations = {}
    first_speaker = None
    earliest_start = None
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # Define "first speaker" as the speaker of the earliest turn, not
        # merely the first line in serialization order.
        if earliest_start is None or turn.start < earliest_start:
            earliest_start = turn.start
            first_speaker = speaker
        speaker_durations[speaker] = (
            speaker_durations.get(speaker, 0.0) + turn.duration
        )

    total_duration = sum(speaker_durations.values())
    if total_duration <= 0:
        return 0  # no speech detected at all
    return (speaker_durations[first_speaker] / total_duration) * 100


def record_and_process(audio):
    """Gradio callback: persist the recorded clip to WAV, diarize it, and
    return a user-facing summary string.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray] | None
        (sample_rate, samples) pair from ``gr.Audio(type="numpy")``, or
        ``None`` when nothing was recorded.
    """
    if audio is None:
        return "No audio was recorded. Please try again."
    sample_rate, audio_data = audio
    file_path = "audio.wav"
    sf.write(file_path, audio_data, sample_rate)
    percentage = process_audio(file_path)
    return f"Percentage of time spoken by the first speaker: {percentage:.2f}%"


interface = gr.Interface(
    fn=record_and_process,
    inputs=gr.Audio(type="numpy"),
    outputs="text",
    title="See How Much You Talk in a Conversation",
    description=(
        "Make sure you are the first person to speak! "
        "You can also use a sample audio file for testing: "
        "sample audio."
    ),
    allow_flagging="never",
)

interface.launch()