BatuhanYilmaz committed on
Commit
613b97e
·
1 Parent(s): 480e8fe
.gitattributes DELETED
@@ -1,31 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ftz filter=lfs diff=lfs merge=lfs -text
6
- *.gz filter=lfs diff=lfs merge=lfs -text
7
- *.h5 filter=lfs diff=lfs merge=lfs -text
8
- *.joblib filter=lfs diff=lfs merge=lfs -text
9
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
- *.model filter=lfs diff=lfs merge=lfs -text
11
- *.msgpack filter=lfs diff=lfs merge=lfs -text
12
- *.npy filter=lfs diff=lfs merge=lfs -text
13
- *.npz filter=lfs diff=lfs merge=lfs -text
14
- *.onnx filter=lfs diff=lfs merge=lfs -text
15
- *.ot filter=lfs diff=lfs merge=lfs -text
16
- *.parquet filter=lfs diff=lfs merge=lfs -text
17
- *.pickle filter=lfs diff=lfs merge=lfs -text
18
- *.pkl filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pt filter=lfs diff=lfs merge=lfs -text
21
- *.pth filter=lfs diff=lfs merge=lfs -text
22
- *.rar filter=lfs diff=lfs merge=lfs -text
23
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
- *.tar.* filter=lfs diff=lfs merge=lfs -text
25
- *.tflite filter=lfs diff=lfs merge=lfs -text
26
- *.tgz filter=lfs diff=lfs merge=lfs -text
27
- *.wasm filter=lfs diff=lfs merge=lfs -text
28
- *.xz filter=lfs diff=lfs merge=lfs -text
29
- *.zip filter=lfs diff=lfs merge=lfs -text
30
- *.zst filter=lfs diff=lfs merge=lfs -text
31
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.streamlit/config.toml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ primaryColor="#F63366"
3
+ backgroundColor="#FFFFFF"
4
+ secondaryBackgroundColor="#F0F2F6"
5
+ textColor="#262730"
6
+ font="sans serif"
7
+ [server]
8
+ maxUploadSize=1028
app.py → 01_🎥_Input_YouTube_Link.py RENAMED
@@ -75,7 +75,7 @@ def change_model(current_size, size):
75
  @st.cache(allow_output_mutation=True)
76
  def inference(link, loaded_model, task):
77
  yt = YouTube(link)
78
- path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
79
  if task == "Transcribe":
80
  options = dict(task="transcribe", best_of=5)
81
  results = loaded_model.transcribe(path, **options)
@@ -153,18 +153,18 @@ def main():
153
  with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
154
  datatxt = f.read()
155
 
156
-
157
- with open("transcript.vtt", "w+",encoding='utf8') as f:
158
- f.writelines(results[1])
159
- f.close()
160
- with open(os.path.join(os.getcwd(), "transcript.vtt"), "rb") as f:
161
- datavtt = f.read()
162
 
163
- with open("transcript.srt", "w+",encoding='utf8') as f:
164
- f.writelines(results[2])
165
- f.close()
166
- with open(os.path.join(os.getcwd(), "transcript.srt"), "rb") as f:
167
- datasrt = f.read()
 
168
  with col5:
169
  st.download_button(label="Download Transcript (.txt)",
170
  data=datatxt,
@@ -184,7 +184,7 @@ def main():
184
 
185
  with col4:
186
  with st.spinner("Generating Subtitled Video"):
187
- video_with_subs = generate_subtitled_video(video, "audio.mp4", "transcript.srt")
188
  st.video(video_with_subs)
189
  st.balloons()
190
  with col8:
@@ -212,7 +212,6 @@ def main():
212
  with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
213
  datatxt = f.read()
214
 
215
-
216
  with open("transcript.vtt", "w+",encoding='utf8') as f:
217
  f.writelines(results[1])
218
  f.close()
@@ -243,7 +242,7 @@ def main():
243
 
244
  with col4:
245
  with st.spinner("Generating Subtitled Video"):
246
- video_with_subs = generate_subtitled_video(video, "audio.mp4", "transcript.srt")
247
  st.video(video_with_subs)
248
  st.balloons()
249
  with col8:
 
75
  @st.cache(allow_output_mutation=True)
76
  def inference(link, loaded_model, task):
77
  yt = YouTube(link)
78
+ path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp3")
79
  if task == "Transcribe":
80
  options = dict(task="transcribe", best_of=5)
81
  results = loaded_model.transcribe(path, **options)
 
153
  with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
154
  datatxt = f.read()
155
 
156
+ with open("transcript.vtt", "w+",encoding='utf8') as f:
157
+ f.writelines(results[1])
158
+ f.close()
159
+ with open(os.path.join(os.getcwd(), "transcript.vtt"), "rb") as f:
160
+ datavtt = f.read()
 
161
 
162
+ with open("transcript.srt", "w+",encoding='utf8') as f:
163
+ f.writelines(results[2])
164
+ f.close()
165
+ with open(os.path.join(os.getcwd(), "transcript.srt"), "rb") as f:
166
+ datasrt = f.read()
167
+
168
  with col5:
169
  st.download_button(label="Download Transcript (.txt)",
170
  data=datatxt,
 
184
 
185
  with col4:
186
  with st.spinner("Generating Subtitled Video"):
187
+ video_with_subs = generate_subtitled_video(video, "audio.mp3", "transcript.srt")
188
  st.video(video_with_subs)
189
  st.balloons()
190
  with col8:
 
212
  with open(os.path.join(os.getcwd(), "transcript.txt"), "rb") as f:
213
  datatxt = f.read()
214
 
 
215
  with open("transcript.vtt", "w+",encoding='utf8') as f:
216
  f.writelines(results[1])
217
  f.close()
 
242
 
243
  with col4:
244
  with st.spinner("Generating Subtitled Video"):
245
+ video_with_subs = generate_subtitled_video(video, "audio.mp3", "transcript.srt")
246
  st.video(video_with_subs)
247
  st.balloons()
248
  with col8:
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Batuhan Yılmaz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,22 @@
1
- ---
2
- title: Auto Subtitled Video Generator
3
- emoji: 📚
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.10.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Auto-Subtitled-Video-Generator
 
 
 
 
 
 
 
 
 
 
2
 
3
+ ![Python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
4
+ ![Streamlit](https://img.shields.io/badge/Streamlit-FF4B4B?style=for-the-badge&logo=Streamlit&logoColor=white)
5
+ ![OpenAI](https://camo.githubusercontent.com/ea872adb9aba9cf6b4e976262f6d4b83b97972d0d5a7abccfde68eb2ae55325f/68747470733a2f2f696d672e736869656c64732e696f2f7374617469632f76313f7374796c653d666f722d7468652d6261646765266d6573736167653d4f70656e414926636f6c6f723d343132393931266c6f676f3d4f70656e4149266c6f676f436f6c6f723d464646464646266c6162656c3d)
6
+
7
+ #### About this project
8
+ - This project is an automatic speech recognition application that takes a YouTube video link or a video file as input to generate a video with subtitles.
9
+ - You can also upload an audio file to generate a transcript as .txt, .vtt, .srt files.
10
+ - The application performs 2 tasks:
11
+ - Detects the language and transcribes the input video in its original language.
12
+ - Detects the language, translates the speech into English and then transcribes it.
13
+ - Downloaded the video from the input link using [pytube](https://github.com/pytube/pytube).
14
+ - Generated a transcription of the video using the [OpenAI Whisper](https://openai.com/blog/whisper) model.
15
+ - Saved the transcriptions as .txt, .vtt and .srt files.
16
+ - Generated a subtitled version of the input video using [ffmpeg](https://github.com/FFmpeg).
17
+ - Displayed the original video and the subtitled video side by side.
18
+ - Built a multipage web app using [Streamlit](https://streamlit.io) and hosted on [HuggingFace Spaces](https://huggingface.co/spaces).
19
+ - You can download the generated .txt, .vtt, .srt files and the subtitled video.
20
+ - You can use the app via this [link](https://huggingface.co/spaces/BatuhanYilmaz/Auto-Subtitled-Video-Generator).
21
+
22
+ ![](auto-sub.gif)
pages DELETED
File without changes
pages/02_📼_Upload_Video_File.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import streamlit as st
3
+ from streamlit_lottie import st_lottie
4
+ from utils import write_vtt, write_srt
5
+ import ffmpeg
6
+ import requests
7
+ from typing import Iterator
8
+ from io import StringIO
9
+ import numpy as np
10
+ import pathlib
11
+ import os
12
+
13
# Page-level Streamlit settings: browser tab title/icon and a wide layout.
st.set_page_config(
    page_title="Auto Subtitled Video Generator",
    page_icon=":movie_camera:",
    layout="wide",
)
14
+
15
# Define a function that we can use to load lottie files from a link.
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Download a Lottie animation and return its decoded JSON.

    Args:
        url: Address of the Lottie JSON document.

    Returns:
        The parsed JSON body, or ``None`` when the server does not answer
        with HTTP 200.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout below.
    """
    # Without a timeout an unresponsive host would block the page forever.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        return None
    return r.json()
22
+
23
+
24
# Working directories are created next to this page's source file.
APP_DIR = pathlib.Path(__file__).parent.absolute()

LOCAL_DIR = APP_DIR / "local"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)


# NOTE(review): main() binds its own local `loaded_model`, so in the code
# visible here this module-level model looks unused -- confirm before removing.
loaded_model = whisper.load_model("base")
# Placeholder that never equals a selectable size, so change_model() always
# takes its "load" branch.
current_size = "None"


col1, col2 = st.columns([1, 3])
with col1:
    lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_HjK9Ol.json")
    # load_lottieurl() returns None on a non-200 response; skip the animation
    # in that case instead of handing None to st_lottie.
    if lottie is not None:
        st_lottie(lottie, speed=1, height=250, width=250)

with col2:
    st.write("""
    ## Auto Subtitled Video Generator
    ##### Upload a video file and get a video with subtitles.
    ###### ➠ If you want to transcribe the video in its original language, select the task as "Transcribe"
    ###### ➠ If you want to translate the subtitles to English, select the task as "Translate"
    ###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)
48
+
49
+
50
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Return the Whisper model for ``size``.

    Args:
        current_size: Size label regarded as already loaded.
        size: Size label to load.

    Returns:
        The result of ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
57
+
58
+
59
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Transcribe or translate the speech in an uploaded video.

    The upload is saved as ``input.mp4`` under ``save_dir``, its audio is
    converted by ffmpeg to a mono 16 kHz 16-bit PCM ``output.wav``, and that
    file is passed to ``loaded_model.transcribe``.

    Args:
        loaded_model: Model object exposing ``transcribe(path, **options)``.
        uploaded_file: Binary file-like object holding the uploaded video.
        task: ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A ``(text, vtt, srt, language)`` tuple built from the model result.

    Raises:
        ValueError: If ``task`` is neither ``"Transcribe"`` nor ``"Translate"``.
    """
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    if task not in whisper_tasks:
        # Reject bad input before touching the disk or spawning ffmpeg.
        raise ValueError("Task not supported")

    video_path = f"{save_dir}/input.mp4"
    audio_path = f"{save_dir}/output.wav"

    with open(video_path, "wb") as f:
        f.write(uploaded_file.read())

    audio = ffmpeg.input(video_path)
    audio = ffmpeg.output(audio, audio_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    # Both tasks share the same pipeline; only the `task` option differs.
    options = dict(task=whisper_tasks[task], best_of=5)
    results = loaded_model.transcribe(audio_path, **options)
    vtt = getSubs(results["segments"], "vtt", 80)
    srt = getSubs(results["segments"], "srt", 80)
    lang = results["language"]
    return results["text"], vtt, srt, lang
82
+
83
+
84
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Serialize ``segments`` to subtitle text.

    Args:
        segments: Segment dicts forwarded unchanged to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Forwarded to the writer as ``maxLineWidth``.

    Returns:
        Everything the writer wrote, as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    buffer = StringIO()
    writer = write_vtt if format == 'vtt' else write_srt
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
96
+
97
+
98
def generate_subtitled_video(video, audio, transcript):
    """Burn subtitles into a video and return the rendered file's contents.

    ffmpeg applies its ``subtitles`` filter to ``video`` using ``transcript``,
    concatenates the result with ``audio`` (one video and one audio stream)
    and writes ``final.mp4`` in the current working directory, overwriting
    any existing file.

    Args:
        video: Path of the source video.
        audio: Path of the audio to mux in.
        transcript: Path of the subtitle file given to the ``subtitles`` filter.

    Returns:
        A binary file-like object positioned at the start of the rendered
        video. It is an in-memory buffer, so no handle on ``final.mp4`` is
        left open after this function returns.
    """
    from io import BytesIO  # local import: only needed by this function

    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    subtitled = video_file.filter("subtitles", transcript)
    ffmpeg.concat(subtitled, audio_file, v=1, a=1).output("final.mp4").run(quiet=True, overwrite_output=True)

    # Read inside `with` so the file is closed deterministically; the original
    # returned the open handle and never closed it.
    with open("final.mp4", "rb") as rendered:
        video_with_subs = BytesIO(rendered.read())
    return video_with_subs
104
+
105
+
106
def _save_transcript(path, text):
    """Write ``text`` to ``path`` as UTF-8 and return the file's raw bytes."""
    with open(path, "w", encoding='utf8') as f:
        f.write(text)
    # Read back from disk so the download carries exactly what was written.
    with open(path, "rb") as f:
        return f.read()


def _show_results(input_file, results, filename):
    """Lay out the original video, transcript downloads and subtitled video."""
    col3, col4 = st.columns(2)
    col5, col6, col7, col8 = st.columns(4)
    col9, col10 = st.columns(2)
    with col3:
        st.video(input_file)

    # All three files are written before any button is drawn. transcript.srt
    # must exist on disk because generate_subtitled_video() is given its path.
    datatxt = _save_transcript("transcript.txt", results[0])
    datavtt = _save_transcript("transcript.vtt", results[1])
    datasrt = _save_transcript("transcript.srt", results[2])

    with col5:
        st.download_button(label="Download Transcript (.txt)",
                           data=datatxt,
                           file_name="transcript.txt")
    with col6:
        st.download_button(label="Download Transcript (.vtt)",
                           data=datavtt,
                           file_name="transcript.vtt")
    with col7:
        st.download_button(label="Download Transcript (.srt)",
                           data=datasrt,
                           file_name="transcript.srt")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")

    with col4:
        with st.spinner("Generating Subtitled Video"):
            video_with_subs = generate_subtitled_video(f"{save_dir}/input.mp4", f"{save_dir}/output.wav", "transcript.srt")
        st.video(video_with_subs)
        st.snow()
    with col8:
        st.download_button(label="Download Video with Subtitles",
                           data=video_with_subs,
                           file_name=f"{filename}_with_subs.mp4")


def main():
    """Render the page controls and, on request, the transcription results.

    The user picks a model size, uploads a video and selects a task. Pressing
    the task button runs ``inferecence`` and shows the original video, three
    transcript downloads and the subtitled video.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("File", type=["mp4", "avi", "mov", "mkv"])
    # Base name for the download; splitext handles an extension of any length.
    if input_file is not None:
        filename = os.path.splitext(input_file.name)[0]
    else:
        filename = None
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return
    if not st.button(button_labels[task]):
        return
    if input_file is None:
        # Previously this path crashed inside inferecence() on None.read().
        st.warning("Please upload a video file first.")
        return

    results = inferecence(loaded_model, input_file, task)
    _show_results(input_file, results, filename)
226
+
227
+
228
if __name__ == "__main__":
    main()
    # Footer: author credit followed by a linked image badge.
    st.markdown(
        "###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) "
        "[![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)"
    )
pages/03_🔊_Upload_Audio_File.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import streamlit as st
3
+ from streamlit_lottie import st_lottie
4
+ from utils import write_vtt, write_srt
5
+ import ffmpeg
6
+ import requests
7
+ from typing import Iterator
8
+ from io import StringIO
9
+ import numpy as np
10
+ import pathlib
11
+ import os
12
+
13
# Page-level Streamlit settings: browser tab title/icon and a wide layout.
st.set_page_config(
    page_title="Auto Transcriber",
    page_icon="🔊",
    layout="wide",
)
14
+
15
# Define a function that we can use to load lottie files from a link.
@st.cache(allow_output_mutation=True)
def load_lottieurl(url: str):
    """Download a Lottie animation and return its decoded JSON.

    Args:
        url: Address of the Lottie JSON document.

    Returns:
        The parsed JSON body, or ``None`` when the server does not answer
        with HTTP 200.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout below.
    """
    # Without a timeout an unresponsive host would block the page forever.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        return None
    return r.json()
22
+
23
+
24
# Working directories are created next to this page's source file.
APP_DIR = pathlib.Path(__file__).parent.absolute()

LOCAL_DIR = APP_DIR / "local_audio"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)


col1, col2 = st.columns([1, 3])
with col1:
    lottie = load_lottieurl("https://assets1.lottiefiles.com/packages/lf20_1xbk4d2v.json")
    # load_lottieurl() returns None on a non-200 response; skip the animation
    # in that case instead of handing None to st_lottie.
    if lottie is not None:
        st_lottie(lottie, speed=1, height=250, width=250)

with col2:
    st.write("""
    ## Auto Transcriber
    ##### Input an audio file and get a transcript.
    ###### ➠ If you want to transcribe the audio in its original language, select the task as "Transcribe"
    ###### ➠ If you want to translate the transcription to English, select the task as "Translate"
    ###### I recommend starting with the base model and then experimenting with the larger models, the small and medium models often work well. """)

# NOTE(review): main() binds its own local `loaded_model`, so in the code
# visible here this module-level model looks unused -- confirm before removing.
loaded_model = whisper.load_model("base")
# Placeholder that never equals a selectable size, so change_model() always
# takes its "load" branch.
current_size = "None"
47
+
48
+
49
@st.cache(allow_output_mutation=True)
def change_model(current_size, size):
    """Return the Whisper model for ``size``.

    Args:
        current_size: Size label regarded as already loaded.
        size: Size label to load.

    Returns:
        The result of ``whisper.load_model(size)``.

    Raises:
        Exception: If ``size`` equals ``current_size``.
    """
    if current_size == size:
        raise Exception("Model size is the same as the current size.")
    return whisper.load_model(size)
56
+
57
@st.cache(allow_output_mutation=True)
def inferecence(loaded_model, uploaded_file, task):
    """Transcribe or translate the speech in an uploaded audio file.

    The upload is saved as ``input.mp3`` under ``save_dir``, converted by
    ffmpeg to a mono 16 kHz 16-bit PCM ``output.wav``, and that file is
    passed to ``loaded_model.transcribe``.

    Args:
        loaded_model: Model object exposing ``transcribe(path, **options)``.
        uploaded_file: Binary file-like object holding the uploaded audio.
        task: ``"Transcribe"`` or ``"Translate"``.

    Returns:
        A ``(text, vtt, srt, language)`` tuple built from the model result.

    Raises:
        ValueError: If ``task`` is neither ``"Transcribe"`` nor ``"Translate"``.
    """
    whisper_tasks = {"Transcribe": "transcribe", "Translate": "translate"}
    if task not in whisper_tasks:
        # Reject bad input before touching the disk or spawning ffmpeg.
        raise ValueError("Task not supported")

    source_path = f"{save_dir}/input.mp3"
    wav_path = f"{save_dir}/output.wav"

    with open(source_path, "wb") as f:
        f.write(uploaded_file.read())

    audio = ffmpeg.input(source_path)
    audio = ffmpeg.output(audio, wav_path, acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)

    # Both tasks share the same pipeline; only the `task` option differs.
    options = dict(task=whisper_tasks[task], best_of=5)
    results = loaded_model.transcribe(wav_path, **options)
    vtt = getSubs(results["segments"], "vtt", 80)
    srt = getSubs(results["segments"], "srt", 80)
    lang = results["language"]
    return results["text"], vtt, srt, lang
80
+
81
+
82
def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
    """Serialize ``segments`` to subtitle text.

    Args:
        segments: Segment dicts forwarded unchanged to the subtitle writer.
        format: ``'vtt'`` or ``'srt'``.
        maxLineWidth: Forwarded to the writer as ``maxLineWidth``.

    Returns:
        Everything the writer wrote, as one string.

    Raises:
        Exception: If ``format`` is neither ``'vtt'`` nor ``'srt'``.
    """
    if format not in ('vtt', 'srt'):
        raise Exception("Unknown format " + format)

    buffer = StringIO()
    writer = write_vtt if format == 'vtt' else write_srt
    writer(segments, file=buffer, maxLineWidth=maxLineWidth)
    return buffer.getvalue()
94
+
95
+
96
def _save_transcript(path, text):
    """Write ``text`` to ``path`` as UTF-8 and return the file's raw bytes."""
    with open(path, "w", encoding='utf8') as f:
        f.write(text)
    # Read back from disk so the download carries exactly what was written.
    with open(path, "rb") as f:
        return f.read()


def _show_results(input_file, results):
    """Lay out the audio player and the three transcript download buttons."""
    # The second column of the first row is created for layout but left empty.
    col3, _ = st.columns(2)
    col5, col6, col7 = st.columns(3)
    col9, col10 = st.columns(2)

    with col3:
        st.audio(input_file)

    # All three files are written before any button is drawn.
    datatxt = _save_transcript("transcript.txt", results[0])
    datavtt = _save_transcript("transcript.vtt", results[1])
    datasrt = _save_transcript("transcript.srt", results[2])

    with col5:
        st.download_button(label="Download Transcript (.txt)",
                           data=datatxt,
                           file_name="transcript.txt")
    with col6:
        st.download_button(label="Download Transcript (.vtt)",
                           data=datavtt,
                           file_name="transcript.vtt")
    with col7:
        st.download_button(label="Download Transcript (.srt)",
                           data=datasrt,
                           file_name="transcript.srt")
    with col9:
        st.success("You can download the transcript in .srt format, edit it (if you need to) and upload it to YouTube to create subtitles for your video.")
    with col10:
        st.info("Streamlit refreshes after the download button is clicked. The data is cached so you can download the transcript again without having to transcribe the video again.")


def main():
    """Render the page controls and, on request, the transcription results.

    The user picks a model size, uploads an audio file and selects a task.
    Pressing the task button runs ``inferecence`` and shows an audio player
    plus three transcript downloads.
    """
    size = st.selectbox("Select Model Size (The larger the model, the more accurate the transcription will be, but it will take longer)", ["tiny", "base", "small", "medium", "large"], index=1)
    loaded_model = change_model(current_size, size)
    st.write(f"Model is {'multilingual' if loaded_model.is_multilingual else 'English-only'} "
             f"and has {sum(np.prod(p.shape) for p in loaded_model.parameters()):,} parameters.")
    input_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])
    task = st.selectbox("Select Task", ["Transcribe", "Translate"], index=0)

    button_labels = {"Transcribe": "Transcribe", "Translate": "Translate to English"}
    if task not in button_labels:
        st.error("Please select a task.")
        return
    if not st.button(button_labels[task]):
        return
    if input_file is None:
        # Previously this path crashed inside inferecence() on None.read().
        st.warning("Please upload an audio file first.")
        return

    results = inferecence(loaded_model, input_file, task)
    _show_results(input_file, results)
201
+
202
+
203
if __name__ == "__main__":
    main()
    # Footer: author credit followed by a linked image badge.
    st.markdown(
        "###### Made with :heart: by [@BatuhanYılmaz](https://twitter.com/batuhan3326) "
        "[![this is an image link](https://i.imgur.com/thJhzOO.png)](https://www.buymeacoffee.com/batuhanylmz)"
    )