Spaces: Runtime error
updating the app to v1.2
app.py
CHANGED
@@ -1,52 +1,116 @@
    text = ''
-    vid_id = pytube.extract.video_id(url)
-    temp = YouTubeTranscriptApi.get_transcript(vid_id)
-    for t in temp:
-        text+=t['text']+' '
-    yt = YouTube(str(url))
-    except:
-        pass
-import os
-save_dir="./docs/youtube/"
-os.mkdir(save_dir)
-yt = YouTube(str(url))
-audio = yt.streams.filter(only_audio = True).first()
-out_file = audio.download(filename="audio.mp3",output_path = save_dir)
-import transformers
    )
-    text = temp['text']
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
@@ -56,108 +120,153 @@ def summarize_text(title,text):
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer, AutoModelForCausalLM
-    # quantization_config = BitsAndBytesConfig(
-    #     load_in_4bit=True,
-    #     bnb_4bit_compute_dtype=torch.float16,
-    #     bnb_4bit_quant_type="nf4",
-    #     bnb_4bit_use_double_quant=True,
-    # )
-    # model = "nomic-ai/gpt4all-falcon"
-    model = "tiiuae/falcon-7b-instruct"
-    tokenizer = AutoTokenizer.from_pretrained(model,trust_remote_code=True,)
-    model = AutoModelForCausalLM.from_pretrained(model,
-        # trust_remote_code=True,
-        # quantization_config=quantization_config,
-    )
    from langchain import HuggingFacePipeline
    import torch
-    llm2 = HuggingFacePipeline(pipeline=pipeline2)
    # Map
    map_template = """
    """
-    map_prompt = PromptTemplate(
    map_chain = LLMChain(llm=llm, prompt=map_prompt)
    # Reduce - Collapse
    """
    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    collapse_documents_chain = StuffDocumentsChain(
-        llm_chain=
    # Final Reduce - Combine
    """
    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
-        llm_chain=
-    # Combines and
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=collapse_documents_chain,
        # The maximum number of tokens to group documents into.
-        token_max=
    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
@@ -169,27 +278,27 @@ def summarize_text(title,text):
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter
-    with open('
        f.write(text)
-    loader = TextLoader("
    doc = loader.load()
-    text_splitter = TokenTextSplitter(chunk_size=
    docs = text_splitter.split_documents(doc)
-    summary = map_reduce_chain.run({'input_documents':docs, 'title':title})
-    return summary
import gradio as gr
import pytube
@@ -204,29 +313,85 @@ def get_video(url):
    embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
    return embed_html
-def summarize_youtube_video(url,force_transcribe
html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
with gr.Blocks() as demo:
-    # gr.Markdown("Transribe a YouTube video using this demo.")
    with gr.Row():
        url = gr.Textbox(label="Enter YouTube video URL here:",placeholder="https://www.youtube.com/watch?v=")
-        force_transcribe = gr.Checkbox(label="Transcribe even if transcription is available.")
        with gr.Column(scale=1):
-            gr.
    title = gr.Textbox(label="Video Title",placeholder="title...")
    with gr.Row():
@@ -1,52 +1,116 @@
+import os
+save_dir="./docs/youtube/"
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)

+transcription_model_id = "openai/whisper-large"
+llm_model_id = "tiiuae/falcon-7b-instruct"

+from youtube_transcript_api import YouTubeTranscriptApi
+import pytube

+# get the transcript from YouTube
+def get_yt_transcript(url):
    text = ''
+    vid_id = pytube.extract.video_id(url)
+    temp = YouTubeTranscriptApi.get_transcript(vid_id)
+    for t in temp:
+        text+=t['text']+' '
+    return text

+from pytube import YouTube
+import transformers
+import torch

+# transcribes the video
+def transcribe_yt_vid(url):
+    # download the YouTube video's audio
+    yt = YouTube(str(url))
+    audio = yt.streams.filter(only_audio = True).first()
+    out_file = audio.download(filename="audio.mp3",
+                              output_path = save_dir)

+    # defining an automatic-speech-recognition pipeline
+    asr = transformers.pipeline(
+        "automatic-speech-recognition",
+        model=transcription_model_id,
+        device_map= 'auto',
+    )

+    # setting model config parameters
+    asr.model.config.forced_decoder_ids = (
+        asr.tokenizer.get_decoder_prompt_ids(
+            language="en",
+            task="transcribe"
        )
+    )

+    # invoking the Whisper model
+    temp = asr(out_file,chunk_length_s=20)
+    text = temp['text']

+    # we can do this at the end to release GPU memory
+    del(asr)
+    torch.cuda.empty_cache()

+    return text

+from pytube import YouTube
+from huggingface_hub import InferenceClient

+# transcribes the video using the Hugging Face Hub API
+def transcribe_yt_vid_api(url,api_token):
+    # download the YouTube video's audio
+    yt = YouTube(str(url))
+    audio = yt.streams.filter(only_audio = True).first()
+    out_file = audio.download(filename="audio.mp3",
+                              output_path = save_dir)

+    # Initialize client for the Whisper model
+    client = InferenceClient(model=transcription_model_id,
+                             token=api_token)

+    import librosa
+    import soundfile as sf

+    text = ''
+    t=20 # audio chunk length in seconds
+    x, sr = librosa.load(out_file, sr=None)
+    # This gives x as the audio as a numpy array and sr as the original sampling rate
+    # The audio needs to be split into 20-second chunks since the API call truncates the response
+    for i in range(0, len(x)//(t * sr)):
+        y = x[t * sr * i: t * sr *(i+1)]
+        split_path = save_dir+"audio_split.mp3"
+        sf.write(split_path, y, sr)
+        text += client.automatic_speech_recognition(split_path)

+    return text

+def transcribe_youtube_video(url, force_transcribe=False,use_api=False,api_token=None):

+    yt = YouTube(str(url))
+    text = ''
+    # get the transcript from YouTube if available
+    try:
+        text = get_yt_transcript(url)
+    except:
+        pass

+    # transcribes the video if YouTube did not provide a transcription
+    # or if you want to force_transcribe anyway
+    if text == '' or force_transcribe:
+        if use_api:
+            text = transcribe_yt_vid_api(url,api_token=api_token)
+            transcript_source = 'The transcript was generated using {} via the Hugging Face Hub API.'.format(transcription_model_id)
+        else:
+            text = transcribe_yt_vid(url)
+            transcript_source = 'The transcript was generated using {} hosted locally.'.format(transcription_model_id)
+    else:
+        transcript_source = 'The transcript was downloaded from YouTube.'

+    return yt.title, text, transcript_source

+def summarize_text(title,text,temperature,words,use_api=False,api_token=None,do_sample=False):

    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
@@ -56,108 +120,153 @@ def summarize_text(title,text):
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer, AutoModelForCausalLM
+
    from langchain import HuggingFacePipeline
    import torch

+    model_kwargs1 = {"temperature":temperature ,
+                     "do_sample":do_sample,
+                     "min_new_tokens":200-25,
+                     "max_new_tokens":200+25
+                     }
+    model_kwargs2 = {"temperature":temperature ,
+                     "do_sample":do_sample,
+                     "min_new_tokens":words-25,
+                     "max_new_tokens":words+25,
+                     'repetition_penalty':2.0
+                     }
+    if not do_sample:
+        del model_kwargs1["temperature"]
+        del model_kwargs2["temperature"]

+    if use_api:

+        from langchain import HuggingFaceHub

+        # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
+        llm=HuggingFaceHub(
+            repo_id=llm_model_id, model_kwargs=model_kwargs1,
+            huggingfacehub_api_token=api_token
+        )
+        llm2=HuggingFaceHub(
+            repo_id=llm_model_id, model_kwargs=model_kwargs2,
+            huggingfacehub_api_token=api_token
+        )
+        summary_source = 'The summary was generated using {} via Hugging Face API.'.format(llm_model_id)

+    else:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )

+        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
+        model = AutoModelForCausalLM.from_pretrained(llm_model_id,
+                                                     quantization_config=quantization_config)
+        model.to_bettertransformer()

+        pipeline = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            pad_token_id=tokenizer.eos_token_id,
+            **model_kwargs1,
+        )
+        pipeline2 = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            pad_token_id=tokenizer.eos_token_id,
+            **model_kwargs2,
+        )
+        llm = HuggingFacePipeline(pipeline=pipeline)
+        llm2 = HuggingFacePipeline(pipeline=pipeline2)

+        summary_source = 'The summary was generated using {} hosted locally.'.format(llm_model_id)

    # Map
    map_template = """
+You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+Summarize the following text in a clear and concise way:\n
+----------------------- \n
+TITLE: `{title}`\n
+TEXT:\n
+`{docs}`\n
+----------------------- \n
+BRIEF SUMMARY:\n
    """
+    map_prompt = PromptTemplate(
+        template = map_template,
+        input_variables = ['title','docs']
+    )
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce - Collapse
+    collapse_template = """
+You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+The following is a set of partial summaries of a video:\n
+----------------------- \n
+TITLE: `{title}`\n
+PARTIAL SUMMARIES:\n
+`{doc_summaries}`\n
+----------------------- \n
+Take these and distill them into a consolidated summary.\n
+SUMMARY:\n
    """

+    collapse_prompt = PromptTemplate(
+        template = collapse_template,
+        input_variables = ['title','doc_summaries']
+    )
+    collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    collapse_documents_chain = StuffDocumentsChain(
+        llm_chain=collapse_chain, document_variable_name="doc_summaries"
+    )

    # Final Reduce - Combine
+    combine_template = """\n
+You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+The following is a set of partial summaries of a video:\n
+----------------------- \n
+TITLE: `{title}`\n
+PARTIAL SUMMARIES:\n
+`{doc_summaries}`\n
+----------------------- \n
+Generate an executive summary of the whole text in maximum {words} words that contains the main messages, points, and arguments presented in the video.\n
+EXECUTIVE SUMMARY:\n
    """
+    combine_prompt = PromptTemplate(
+        template = combine_template,
+        input_variables = ['title','doc_summaries','words']
+    )
+    combine_chain = LLMChain(llm=llm2, prompt=combine_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=combine_chain, document_variable_name="doc_summaries"
+    )

+    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=collapse_documents_chain,
        # The maximum number of tokens to group documents into.
+        token_max=800,
+    )

    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
@@ -169,27 +278,27 @@ def summarize_text(title,text):
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
+    )

    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter

+    with open(save_dir+'/transcript.txt','w') as f:
        f.write(text)
+    loader = TextLoader(save_dir+"/transcript.txt")
    doc = loader.load()
+    text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
    docs = text_splitter.split_documents(doc)

+    summary = map_reduce_chain.run({'input_documents':docs, 'title':title, 'words':words})

+    try:
+        del(map_reduce_chain,reduce_documents_chain,combine_chain,collapse_documents_chain,map_chain,collapse_chain,llm,llm2,pipeline,pipeline2,model,tokenizer)
+    except:
+        pass
+    torch.cuda.empty_cache()

+    return summary, summary_source

import gradio as gr
import pytube
@@ -204,29 +313,85 @@ def get_video(url):
    embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
    return embed_html

+def summarize_youtube_video(url,force_transcribe,use_transcribe_api=False,api_token="",
+                            temperature=1,words=250,use_llm_api=False,do_sample=False):
+    title,text,transcript_source = transcribe_youtube_video(url,force_transcribe,use_transcribe_api,api_token)
+    summary, summary_source = summarize_text(title,text,temperature,words,use_llm_api,api_token,do_sample)
+    return summary, text, transcript_source, summary_source

html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'

+def change_transcribe_api(vis):
+    return gr.Checkbox(value=False, visible=vis)

+def change_api_token(vis):
+    return gr.Textbox(visible=vis)

+def update_source(source):
+    return gr.Textbox(info=source)

+def show_temp(vis):
+    return gr.Slider(visible=vis)

+# Defining the structure of the UI
with gr.Blocks() as demo:
    with gr.Row():
+        gr.Markdown("# Summarize a YouTube Video")

+    with gr.Row():
+        with gr.Column(scale=4):
            url = gr.Textbox(label="Enter YouTube video URL here:",placeholder="https://www.youtube.com/watch?v=")
        with gr.Column(scale=1):
+            api_token = gr.Textbox(label="Paste your Hugging Face API token here:",placeholder="hf_...",visible=False,show_label=True,info='The API token passed via this field is not stored. It is only passed through the Hugging Face Hub API for inference.')
+        with gr.Column(scale=1):
+            sum_btn = gr.Button("Summarize!")

+    with gr.Accordion("Transcription Settings",open=False):
+        with gr.Row():
+            force_transcribe = gr.Checkbox(label="Transcribe even if transcription is available.", info='If unchecked, the app attempts to download the transcript from YouTube first. Check this if the transcript does not seem accurate.')
+            use_transcribe_api = gr.Checkbox(label="Transcribe using the HuggingFaceHub API.",visible=False)

+    with gr.Accordion("Summarization Settings",open=False):
+        with gr.Row():
+            use_llm_api = gr.Checkbox(label="Summarize using the HuggingFaceHub API.",visible=True)
+            do_sample = gr.Checkbox(label="Set the Temperature",value=True,visible=True)
+            temperature = gr.Slider(minimum=0,maximum=1,value=0.9,label="Generation temperature",visible=True)
+            words = gr.Slider(minimum=100,maximum=500,value=250,label="Length of the summary")

+    gr.Markdown("# Results")

    title = gr.Textbox(label="Video Title",placeholder="title...")

+    with gr.Row():
+        video = gr.HTML(html,scale=1)
+        summary_source = gr.Textbox(visible=False,scale=0)
+        summary = gr.Textbox(label="Summary",placeholder="summary...",scale=1)

    with gr.Row():
+        with gr.Group():
+            transcript_source = gr.Textbox(visible=False)
+            transcript = gr.Textbox(label="Full Transcript",placeholder="transcript...",show_label=True)

+    with gr.Accordion("Notes",open=False):
+        gr.Markdown("""
+        1. This app attempts to download the transcript from YouTube first. If the transcript is not available, or if transcription is forced, the video will be transcribed.\n
+        2. The app performs best on videos in which the number of speakers is limited or when the YouTube transcript includes annotations of the speakers.\n
+        3. The transcription does not annotate the speakers, which may degrade the quality of the summary if there is more than one speaker.\n
+        """)

+    # Defining the interactivity of the UI elements
+    force_transcribe.change(fn=change_transcribe_api,inputs=force_transcribe,outputs=use_transcribe_api)
+    use_transcribe_api.change(fn=change_api_token,inputs=use_transcribe_api,outputs=api_token)
+    use_llm_api.change(fn=change_api_token,inputs=use_llm_api,outputs=api_token)
+    transcript_source.change(fn=update_source,inputs=transcript_source,outputs=transcript)
+    summary_source.change(fn=update_source,inputs=summary_source,outputs=summary)
+    do_sample.change(fn=show_temp,inputs=do_sample,outputs=temperature)

+    # Defining the functions to call on clicking the button
+    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title", queue=False)
+    sum_btn.click(fn=summarize_youtube_video, inputs=[url,force_transcribe,use_transcribe_api,api_token,temperature,words,use_llm_api,do_sample],
+                  outputs=[summary,transcript, transcript_source, summary_source], api_name="summarize_youtube_video", queue=True)
+    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video", queue=False)

+demo.queue()
+demo.launch(share=False)
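
Since the click handlers above register named API endpoints (api_name="get_youtube_title", "summarize_youtube_video", "get_youtube_video"), the updated Space can also be driven programmatically. The following is an illustrative sketch only, not part of the commit: "user/space-name" is a placeholder for the actual Space id, and the argument order simply mirrors the `inputs` list wired to sum_btn.

# Illustrative usage sketch (assumes the Space is running and reachable;
# "user/space-name" is a hypothetical placeholder, not the real Space id)
from gradio_client import Client

client = Client("user/space-name")

# /summarize_youtube_video takes the same inputs, in the same order,
# as the sum_btn.click wiring above
summary, transcript, transcript_source, summary_source = client.predict(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # url
    False,   # force_transcribe
    False,   # use_transcribe_api
    "",      # api_token
    0.9,     # temperature
    250,     # words
    False,   # use_llm_api
    False,   # do_sample
    api_name="/summarize_youtube_video",
)
print(summary)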