# Ibrahim Olanigan
# Fixed transcript download
# 7dfdb5c
import streamlit as st
import os
import subprocess
import whisper
# --- st.session_state keys -------------------------------------------------
# Key of the URL text-input widget; download() reads the URL from this key.
URL = 'URL'
# Transcript text produced by transcribe() (or restored from disk in display()).
TEXT = 'TEXT'
# Video title captured by download().
TITLE = 'TITLE'
# Set to True while display() runs the download/transcribe pipeline.
PROCESSING = 'PROCESSING'
# Flags refreshed by check_audio(): whether the files below exist on disk.
AUDIO_EXISTS = "AUDIO_EXISTS"
TRANSCRIPT_EXISTS = "TRANSCRIPT_EXISTS"
# Keys that init_state() defaults to an empty string.
STATES = [ TEXT, TITLE]
# Keys that init_state() defaults to False.
BOOL_STATES = [ AUDIO_EXISTS, TRANSCRIPT_EXISTS, PROCESSING]
# --- Working files (relative to the current working directory) -------------
AUDIO_FILE = "audio.mp3"
TRANSCRIPT = "transcript.txt"
# NOTE(review): module-level placeholder; the functions in this file only bind
# local variables named `model`, so this value is never updated here.
model = ''
# Page heading.
st.title('Youtube Assistant')
def init_state():
    """Seed ``st.session_state`` with defaults for any key not yet present.

    Keys listed in ``STATES`` default to an empty string and keys listed in
    ``BOOL_STATES`` default to ``False``. Keys that already exist are left
    untouched.
    """
    # print("Page refreshed")
    for keys, default in ((STATES, ''), (BOOL_STATES, False)):
        for key in keys:
            if key in st.session_state:
                continue
            st.session_state[key] = default
def clear_old_files():
    """Delete leftover audio/transcript files, then refresh the file flags.

    Every entry of the current working directory whose name ends in ``.mp3``
    or equals ``TRANSCRIPT`` is removed, and each removal is logged.
    """
    print("Clearing old files")
    for name in os.listdir():
        is_stale = name.endswith(".mp3") or name == TRANSCRIPT
        if not is_stale:
            continue
        os.remove(name)
        print(f"Removed old files::{name}")
    # Refresh audio state
    check_audio()
# st.cache_resource keeps one shared, un-copied model object across reruns.
# st.cache_data is meant for serialisable data: it pickles the return value
# and hands back a fresh copy on every call, which is wasteful for a model.
@st.cache_resource
def load_whisper():
    """Load the Whisper "small" model, cached across Streamlit reruns.

    Returns:
        The model object returned by ``whisper.load_model("small")``.
    """
    # Runs only when the body actually executes (i.e. not on a cache hit).
    check_audio()
    whisper_model = whisper.load_model("small")
    print('Loaded Whisper small model')
    return whisper_model
def transcribe():
    """Transcribe the downloaded audio with Whisper and persist the results.

    The transcript text is stored in ``st.session_state[TEXT]`` and written
    to ``TRANSCRIPT``; the stringified segment list is written to
    ``segments.txt``.

    Returns:
        The transcript text, or ``None`` when the ``AUDIO_EXISTS`` flag is
        not set (nothing is transcribed in that case).
    """
    if not st.session_state[AUDIO_EXISTS]:
        return None

    model = load_whisper()
    result = model.transcribe(AUDIO_FILE)
    text = result["text"]
    st.session_state[TEXT] = text
    print(f"Start - {text[:100]}")
    print(f"End - {text[-100:]}")

    write_file(text, TRANSCRIPT)
    # Refresh the flags so TRANSCRIPT_EXISTS reflects the file just written.
    check_audio()
    write_file(str(result["segments"]), "segments.txt")
    return text
def check_audio():
    """Record in session state whether the audio and transcript files exist."""
    for flag, path in ((AUDIO_EXISTS, AUDIO_FILE), (TRANSCRIPT_EXISTS, TRANSCRIPT)):
        st.session_state[flag] = os.path.exists(path)
def load_audio():
    """Render an audio player for ``AUDIO_FILE`` when it is flagged as present.

    Does nothing when the ``AUDIO_EXISTS`` flag is missing or falsy.
    """
    if not (AUDIO_EXISTS in st.session_state and st.session_state[AUDIO_EXISTS]):
        return

    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(AUDIO_FILE, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format="audio/mp3")
def display():
    """Render the page: URL form, processing pipeline, transcript, downloads.

    On a form submit with a non-empty URL the old files are cleared, the
    audio is downloaded and played back, and the audio is transcribed.
    The transcript text area and the two download buttons are rendered on
    every run from the current session state and files on disk.
    """
    check_audio()
    container = st.container()
    text_container = st.container()

    with container:
        with st.form(key='input_form', clear_on_submit=False):
            user_input = st.text_input("Youtube URL:", placeholder="https://www.youtube.com", key=URL)
            input_submit_button = st.form_submit_button(label='Send')

        if input_submit_button and user_input:
            st.session_state[PROCESSING] = True
            try:
                clear_old_files()
                with st.spinner('Downloading Audio...'):
                    download()
                load_audio()
                with st.spinner('Transcribing Audio...'):
                    transcribe()
            finally:
                # Reset the flag even if the download or transcription raises.
                st.session_state[PROCESSING] = False

    # Restore the transcript from disk *before* rendering the text area, so
    # that a transcript left by an earlier session is shown on this run
    # rather than only being offered through the download button.
    if st.session_state[TRANSCRIPT_EXISTS] and st.session_state[TEXT] == '':
        with open(TRANSCRIPT, "rb") as f:
            # convert bytes to utf-8 string
            st.session_state[TEXT] = f.read().decode("utf-8")

    with text_container:
        st.text_area(
            label=f"Youtube Transcript: {st.session_state[TITLE]}",
            height=200,
            value=st.session_state[TEXT],
        )

        # Download Button section
        col1, col2 = st.columns(2)
        with col1:
            if AUDIO_EXISTS in st.session_state and st.session_state[AUDIO_EXISTS]:
                with open(AUDIO_FILE, "rb") as f:
                    data = f.read()
                st.download_button('Download MP3', data, AUDIO_FILE, key="mp3")
        with col2:
            if st.session_state[TRANSCRIPT_EXISTS]:
                st.download_button("Download Transcript", st.session_state[TEXT], TRANSCRIPT, key="transcript")
def download():
    """Fetch the video title and download its audio track to ``AUDIO_FILE``.

    Reads the URL from ``st.session_state[URL]``, stores the title reported
    by ``yt-dlp`` in ``st.session_state[TITLE]``, then runs ``yt-dlp`` to
    extract the audio as MP3 and refreshes the file-existence flags.

    Both commands are passed as argument lists without a shell. The URL is
    user input: interpolating it into a shell string allowed command
    injection and also broke on ordinary URLs containing ``&``.
    """
    url = st.session_state[URL].strip()

    # Get youtube title. "--" ends option parsing so a value starting with
    # "-" cannot be interpreted as a yt-dlp option.
    title_proc = subprocess.run(["yt-dlp", "--get-title", "--", url], capture_output=True)
    st.session_state[TITLE] = title_proc.stdout.decode("utf-8", errors="replace").strip()

    # Download and convert audio
    command = [
        "yt-dlp", "--no-config", "-v",
        "--extract-audio", "--audio-format", "mp3",
        "-o", AUDIO_FILE,
        "--", url,
    ]
    print(command)
    subprocess.run(command)
    check_audio()
def write_file(text, filename):
    """Write *text* to *filename* as UTF-8, replacing any existing content.

    Args:
        text: The string to write.
        filename: Path of the file to create or overwrite.
    """
    # Pin the encoding: the transcript is read back as UTF-8 elsewhere in this
    # file, and the platform default encoding is not UTF-8 everywhere.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
def main():
    """Entry point: initialise session-state defaults, then render the page."""
    init_state()
    display()


# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()