Ibrahim Olanigan committed on
Commit
61a06c1
·
1 Parent(s): 1e5ea64

Add Application files

Browse files
Files changed (2) hide show
  1. app.py +163 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pytube as pt
3
+ import os
4
+ import subprocess
5
+ import re
6
+ from utils import logtime, load_ffmpeg
7
+ import whisper
8
+ from langchain.document_loaders import YoutubeLoader
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+
11
# Session-state keys. PROCESSING and STATES are declared here but are not
# referenced by any function visible in this file.
URL = 'URL'
TEXT = 'TEXT'
WHISPER = 'WHISPER'
PROCESSING = 'PROCESSING'
AUDIO_EXISTS = "AUDIO_EXISTS"
STATES = [URL, TEXT, WHISPER, PROCESSING]

# File name that load_audio() looks for and plays back.
AUDIO_FILE = "audio.mp3"

# Module-level placeholder; load_whisper() binds its own local `model`,
# so this value is never read by the visible code.
model = ''

st.title('Youtube Audio+Text')
21
+
22
def init_state():
    """Seed ``st.session_state`` with a default for every key not set yet.

    URL, TEXT and WHISPER default to an empty string; AUDIO_EXISTS defaults
    to False. Keys that already hold a value are left untouched.
    """
    defaults = (
        (URL, ''),
        (TEXT, ''),
        (WHISPER, ''),
        (AUDIO_EXISTS, False),
    )
    for key, initial in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial

    # if not st.session_state[URL]:
    #     clear_old_files()
37
+
38
def clear_old_files():
    """Delete every ``*.mp3`` file and ``transcript.txt`` from the current
    working directory, printing one line per removed file.
    """
    for entry in os.listdir():
        is_stale = entry == 'transcript.txt' or entry.endswith(".mp3")
        if not is_stale:
            continue
        os.remove(entry)
        print(f"Removed old files::{entry}")
43
+
44
+
45
def extract_youtube_video_id(url):
    """Return the video id contained in a YouTube URL, or None.

    Recognised forms:
      * ``.../watch?v=<id>`` (the ``v`` parameter may appear anywhere in the
        query string; a trailing ``#fragment`` is not part of the id)
      * ``youtu.be/<id>``
      * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``, ``.../v/<id>``

    Anything else falls back to the original ``v=`` pattern search, so every
    input that previously produced an id still does.

    Args:
        url: URL text; ``None`` or an empty string yields ``None``.

    Returns:
        The video id as a string, or ``None`` when none can be found.
    """
    from urllib.parse import parse_qs, urlparse

    if not url:
        return None

    text = url.strip()
    # Without a scheme urlparse() treats the host as part of the path;
    # a leading "//" makes it parse "youtu.be/abc" the same as the full URL.
    parsed = urlparse(text if "://" in text else "//" + text)

    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    segments = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    # Legacy behaviour: plain pattern search on the raw text.
    match = re.search(r"v=([^&]+)", text)
    return match.group(1) if match else None
52
+
53
@logtime
def load_whisper():
    """Transcribe ``AUDIO_FILE`` with the Whisper "small" model.

    The transcript text is stored in ``st.session_state[WHISPER]`` and also
    written to ``transcript.txt`` via ``write_file``.
    """
    # if not model:
    whisper_model = whisper.load_model("small")
    # The log line now names the model that is actually loaded.
    print('Loaded Whisper small model')
    # else:
    #     print('Already downloaded Whisper model')
    print('Transcribing with Whisper model')
    # Use the shared constant instead of repeating the literal path.
    result = whisper_model.transcribe(AUDIO_FILE)
    transcript = result["text"]
    st.session_state[WHISPER] = transcript
    write_file(transcript, "transcript.txt")
65
+
66
def load_audio():
    """Show an audio player for ``AUDIO_FILE`` when the file exists.

    ``st.session_state[AUDIO_EXISTS]`` is set to whether the file is present,
    so later code can decide whether to offer the audio download.
    """
    audio_present = os.path.exists(AUDIO_FILE)
    st.session_state[AUDIO_EXISTS] = audio_present
    if not audio_present:
        return

    # Context manager so the file handle is closed once the bytes are read.
    with open(AUDIO_FILE, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    print(f"Audio file exists...{len(audio_bytes)}")
    st.audio(audio_bytes, format="audio/mp3")
75
+
76
def display():
    """Render the page: URL form, transcript box, audio player and downloads.

    When the form is submitted with a non-empty URL, the URL is echoed and
    ``load_whisper()`` is run.
    """
    # Containers are created before anything else is rendered.
    # NOTE(review): presumably this places the form and transcript box above
    # the audio player and buttons - confirm against Streamlit layout rules.
    container = st.container()
    text_container = st.container()
    # whisper_container = st.container()
    load_audio()

    # Download Button section
    col1, col2 = st.columns(2)
    with col1:
        if st.session_state[AUDIO_EXISTS] and os.path.exists(AUDIO_FILE):
            # Serve the real audio bytes, not a placeholder string.
            with open(AUDIO_FILE, "rb") as audio_file:
                audio_bytes = audio_file.read()
            st.download_button("Download Audio", audio_bytes, "audio.mp3", "application/octet-stream")
    with col2:
        if os.path.exists("transcript.txt"):
            # Serve what is on disk: the Whisper path writes transcript.txt
            # without ever filling st.session_state[TEXT].
            with open("transcript.txt", "rb") as transcript_file:
                transcript_bytes = transcript_file.read()
            st.download_button("Download Transcript", transcript_bytes, "transcript.txt", "text/plain")

    with container:
        with st.form(key='input_form'):
            user_input = st.text_input("Youtube URL:", placeholder="http://www.youtube.com", key=URL)
            input_submit_button = st.form_submit_button(label='Send')

        if input_submit_button and user_input:
            st.write("You entered... " + st.session_state[URL])
            # transcribe()
            # download()
            # download_audio()
            load_whisper()

    with text_container:
        st.text_area(label="Youtube Transcript:",
                     height=200,
                     value=st.session_state[TEXT])
    # with whisper_container:
    #     st.text_area(label="Whisper Transcript:",
    #                  height=200,
    #                  value=st.session_state[WHISPER])
113
+
114
@logtime
def download_audio():
    """Download the first audio-only stream of the session URL with pytube.

    The stream is saved as ``audio.mp3``. Nothing happens when no URL is
    stored in the session state.
    """
    if not st.session_state[URL]:
        return

    print("Downloading....")
    video = pt.YouTube(st.session_state[URL])
    audio_only = video.streams.filter(only_audio=True)
    audio_only[0].download(filename="audio.mp3")
    print("Downloaded Audio file....")
122
+
123
def download():
    """Extract the session URL's audio to ``audio.mp3`` with the yt-dlp CLI.

    The command is passed as an argument list without a shell, so characters
    such as ``&`` or ``;`` in the user-supplied URL reach yt-dlp verbatim
    instead of being interpreted by the shell. Progress and failures are
    reported with ``print``; no exception is raised for a missing executable
    or a non-zero exit status.
    """
    url = st.session_state[URL]
    if not url:
        print('No URL provided; skipping YT-DLP download')
        return

    command = [
        "yt-dlp", "--no-config", "-v",
        "--extract-audio", "--audio-format", "mp3",
        "-o", "audio.mp3",
        # "--" ends option parsing so a URL starting with "-" is not
        # mistaken for a yt-dlp flag.
        "--", url,
    ]
    print(command)
    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        # Without a shell, a missing binary raises instead of returning 127.
        print('yt-dlp executable not found on PATH')
        return

    if completed.returncode == 0:
        print('Download with YT-DLP done!!')
    else:
        print(f'YT-DLP exited with status {completed.returncode}')
129
+
130
@logtime
def transcribe():
    """Load the transcript for the session URL through ``YoutubeLoader``.

    The transcript is split into chunks of 2000 characters with 500
    characters of overlap, written to ``transcript.txt`` by ``write_chunks``,
    and the value returned by ``write_chunks`` is stored in
    ``st.session_state[TEXT]``.
    """
    video_url = st.session_state[URL]
    loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=True)
    chunker = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    chunks = loader.load_and_split(chunker)

    total = len(chunks)
    # Only used in the log line below; no single chunk is displayed.
    shown = int(total / 3 + 1)
    print(f"Loaded {total} documents, Displaying {shown}-th document")
    # st.session_state[TEXT] = docs[index].page_content
    st.session_state[TEXT] = write_chunks(chunks, "transcript.txt")
141
+
142
@logtime
def write_chunks(docs, filename):
    """Join the chunks' text, write it to *filename* and return it.

    Args:
        docs: Iterable of objects exposing a string ``page_content``.
        filename: Path of the file to (over)write.

    Returns:
        The text written: every ``page_content`` followed by a newline.
        NOTE(review): callers rely on ``@logtime`` passing this value
        through - confirm in ``utils``.
    """
    full_doc = "".join(doc.page_content + "\n" for doc in docs)
    # Explicit UTF-8: the locale default can fail on non-ASCII text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(full_doc)
    return full_doc
150
+
151
def write_file(text, filename):
    """Write *text* to *filename*, replacing any existing content.

    The file is encoded as UTF-8 so the result does not depend on the
    platform's default encoding, which can fail on non-ASCII text.

    Args:
        text: String to write.
        filename: Path of the file to (over)write.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
155
+
156
def main():
    """Application entry point: seed the session defaults, then draw the UI."""
    # load_ffmpeg()
    init_state()
    display()


# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ openai
2
+ langchain
3
+ youtube-transcript-api
4
+ pytube
5
+ openai-whisper