adhim committed on
Commit 74b5695 · 1 Parent(s): aa0a99f

app.py updated & requirements.txt added

Files changed (2)
  1. app.py +781 -4
  2. requirements.txt +7 -0
app.py CHANGED
@@ -1,7 +1,784 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "!!"
 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ # 0 - libraries
+ import transformers
  import gradio as gr
 
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from huggingface_hub import InferenceClient
+ from pytube import YouTube
+ import pytube
+ import torch
 
+ # 1 - abstractive_summary
+ # 1.1 - initialize
+ import os
+ save_dir = os.path.join(os.getcwd(), "docs")
+ if not os.path.exists(save_dir):
+     os.mkdir(save_dir)
+ transcription_model_id = "openai/whisper-large"
+ llm_model_id = "tiiuae/falcon-7b-instruct"
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+ # 1.2 - transcription
+ def get_yt_transcript(url):
+     text = ""
+     vid_id = pytube.extract.video_id(url)
+     temp = YouTubeTranscriptApi.get_transcript(vid_id)
+     for t in temp:
+         text += t["text"] + " "
+     return text
+
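+ # Note: YouTubeTranscriptApi.get_transcript returns a list of snippet dicts
+ # with "text", "start", and "duration" keys; only the "text" fields are
+ # concatenated above.
+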
+ # 1.2.1 - locally_transcribe
+ def transcribe_yt_vid(url):
+     # download YouTube video's audio
+     yt = YouTube(str(url))
+     audio = yt.streams.filter(only_audio=True).first()
+     out_file = audio.download(filename="audio.mp3", output_path=save_dir)
+
+     # defining an automatic-speech-recognition pipeline
+     asr = transformers.pipeline(
+         "automatic-speech-recognition",
+         model=transcription_model_id,
+         device_map="auto",
+     )
+
+     # setting model config parameters
+     asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
+         language="en", task="transcribe"
+     )
+
+     # invoking the Whisper model
+     temp = asr(out_file, chunk_length_s=20)
+     text = temp["text"]
+
+     # we can do this at the end to release GPU memory
+     del asr
+     torch.cuda.empty_cache()
+
+     return text
+
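+ # chunk_length_s=20 enables the pipeline's chunked long-form decoding, letting
+ # Whisper (natively limited to 30-second inputs) transcribe full-length audio.
+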
+ # 1.2.2 - api_transcribe
+ def transcribe_yt_vid_api(url, api_token):
+     # download YouTube video's audio
+     yt = YouTube(str(url))
+     audio = yt.streams.filter(only_audio=True).first()
+     out_file = audio.download(filename="audio.wav", output_path=save_dir)
+
+     # Initialize a client for the Whisper model
+     client = InferenceClient(model=transcription_model_id, token=api_token)
+
+     import librosa
+     import soundfile as sf
+
+     text = ""
+     t = 25  # audio chunk length in seconds
+     x, sr = librosa.load(out_file, sr=None)
+     # This gives x as the audio in a numpy array and sr as the original sampling rate.
+     # The audio needs to be split into 25-second chunks since the API call truncates the response.
+     for i in range(0, (len(x) // (t * sr)) + 1):
+         y = x[t * sr * i : t * sr * (i + 1)]
+         split_path = os.path.join(save_dir, "audio_split.wav")
+         sf.write(split_path, y, sr)
+         text += client.automatic_speech_recognition(split_path)
+
+     return text
+
+
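+ # Note (assumption): in the huggingface_hub versions this app targets,
+ # InferenceClient.automatic_speech_recognition returns the transcription as a
+ # plain string, which is why the chunk results can be concatenated directly.
+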
+ # 1.2.3 - transcribe locally or via API
+ def transcribe_youtube_video(url, force_transcribe=False, use_api=False, api_token=None):
+
+     yt = YouTube(str(url))
+     text = ""
+     # get the transcript from YouTube if available
+     try:
+         text = get_yt_transcript(url)
+     except Exception:
+         pass
+
+     # transcribes the video if YouTube did not provide a transcription,
+     # or if you want to force_transcribe anyway
+     if text == "" or force_transcribe:
+         if use_api:
+             text = transcribe_yt_vid_api(url, api_token=api_token)
+             transcript_source = "The transcript was generated using {} via the Hugging Face Hub API.".format(
+                 transcription_model_id
+             )
+         else:
+             text = transcribe_yt_vid(url)
+             transcript_source = (
+                 "The transcript was generated using {} hosted locally.".format(
+                     transcription_model_id
+                 )
+             )
+     else:
+         transcript_source = "The transcript was downloaded from YouTube."
+
+     return yt.title, text, transcript_source
+
+
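+ # The returned (title, text, transcript_source) tuple maps onto the three
+ # Gradio outputs wired up in section 5: the title box, the transcript box,
+ # and the hidden transcript_source field.
+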
+ # 1.3 - turn to paragraph or points
+ def turn_to_paragraph(text):
+     # REMOVE HTML TAGS
+     from bs4 import BeautifulSoup
+
+     # Parse the HTML text
+     soup = BeautifulSoup(text, "html.parser")
+     # Get the text without HTML tags
+     text = soup.get_text()  # print(text_without_tags)
+
+     # Remove leading and trailing whitespace
+     text = text.strip()
+     # Check if the string ends with "User" and remove it
+     if text.endswith("User"):
+         text = text[: -len("User")]
+     # Replace dashes and collapse extra whitespace into single spaces
+     text = (
+         text.replace(" -", "")
+         .replace("  ", " ")
+         .replace("\n", " ")
+         .replace("- ", "")
+         .replace("`", "")
+     )
+     # text = text.replace("  ", "\n\n")  # to keep a second paragraph if it exists # sometimes it's good to turn this on, but let's keep it off
+     text = text.replace("  ", " ")  # turn this off if ^ is on
+
+     return text
+
+
+ # 1.3.1
+ def turn_to_points(text):  # input must come from `turn_to_paragraph()`
+     # text = text.replace(". ", ".\n-")  # to keep a second paragraph if it exists
+     text_with_dashes = ".\n".join("- " + line.strip() for line in text.split(". "))
+     text_with_dashes = text_with_dashes.replace("\n\n", "\n\n- ")  # for the first sentence of a new paragraph
+     return text_with_dashes
+
+ # 1.3.2 - combined functions above for paragraph_or_points
+ def paragraph_or_points(text, pa_or_po):
+     if pa_or_po == "Points":
+         return turn_to_points(turn_to_paragraph(text))
+     else:  # default is Paragraph
+         return turn_to_paragraph(text)
+
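+ # Example: paragraph_or_points("First point. Second point.", "Points")
+ # returns "- First point.\n- Second point."
+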
161
+ # 1.4 - summarization
162
+ def summarize_text(title, text, temperature, words, use_api=False, api_token=None, do_sample=False, length="Short", pa_or_po="Paragraph",):
163
+
164
+ from langchain.chains.llm import LLMChain
165
+ from langchain.prompts import PromptTemplate
166
+ from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
167
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
168
+ import torch
169
+ import transformers
170
+ from transformers import BitsAndBytesConfig
171
+ from transformers import AutoTokenizer, AutoModelForCausalLM
172
+
173
+ from langchain import HuggingFacePipeline
174
+ import torch
175
+
176
+ model_kwargs1 = {
177
+ "temperature": temperature,
178
+ "do_sample": do_sample,
179
+ "min_new_tokens": 200 - 25,
180
+ "max_new_tokens": 200 + 25,
181
+ "repetition_penalty": 20.0,
182
+ }
183
+ model_kwargs2 = {
184
+ "temperature": temperature,
185
+ "do_sample": do_sample,
186
+ "min_new_tokens": words,
187
+ "max_new_tokens": words + 100,
188
+ "repetition_penalty": 20.0,
189
+ }
190
+ if not do_sample:
191
+ del model_kwargs1["temperature"]
192
+ del model_kwargs2["temperature"]
193
+
194
+ if use_api:
195
+
196
+ from langchain import HuggingFaceHub
197
+
198
+ # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
199
+ llm = HuggingFaceHub(
200
+ repo_id=llm_model_id,
201
+ model_kwargs=model_kwargs1,
202
+ huggingfacehub_api_token=api_token,
203
+ )
204
+ llm2 = HuggingFaceHub(
205
+ repo_id=llm_model_id,
206
+ model_kwargs=model_kwargs2,
207
+ huggingfacehub_api_token=api_token,
208
+ )
209
+ summary_source = (
210
+ "The summary was generated using {} via Hugging Face API.".format(
211
+ llm_model_id
212
+ )
213
+ )
214
+
215
+ else:
216
+ quantization_config = BitsAndBytesConfig(
217
+ load_in_4bit=True,
218
+ bnb_4bit_compute_dtype=torch.float16,
219
+ bnb_4bit_quant_type="nf4",
220
+ bnb_4bit_use_double_quant=True,
221
+ )
222
+
223
+ tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
224
+ model = AutoModelForCausalLM.from_pretrained(
225
+ llm_model_id,
226
+ # quantization_config=quantization_config
227
+ )
228
+ model.to_bettertransformer()
229
+
230
+ pipeline = transformers.pipeline(
231
+ "text-generation",
232
+ model=model,
233
+ tokenizer=tokenizer,
234
+ torch_dtype=torch.bfloat16,
235
+ device_map="auto",
236
+ pad_token_id=tokenizer.eos_token_id,
237
+ **model_kwargs1,
238
+ )
239
+ pipeline2 = transformers.pipeline(
240
+ "text-generation",
241
+ model=model,
242
+ tokenizer=tokenizer,
243
+ torch_dtype=torch.bfloat16,
244
+ device_map="auto",
245
+ pad_token_id=tokenizer.eos_token_id,
246
+ **model_kwargs2,
247
+ )
248
+ llm = HuggingFacePipeline(pipeline=pipeline)
249
+ llm2 = HuggingFacePipeline(pipeline=pipeline2)
250
+
251
+ summary_source = "The summary was generated using {} hosted locally.".format(
252
+ llm_model_id
253
+ )
254
+
255
+ # Map
256
+ map_template = """
257
+ Summarize the following video in a clear way:\n
258
+ ----------------------- \n
259
+ TITLE: `{title}`\n
260
+ TEXT:\n
261
+ `{docs}`\n
262
+ ----------------------- \n
263
+ SUMMARY:\n
264
+ """
265
+ map_prompt = PromptTemplate(
266
+ template=map_template, input_variables=["title", "docs"]
267
+ )
268
+ map_chain = LLMChain(llm=llm, prompt=map_prompt)
269
+
270
+ # Reduce - Collapse
271
+ collapse_template = """
272
+ TITLE: `{title}`\n
273
+ TEXT:\n
274
+ `{doc_summaries}`\n
275
+ ----------------------- \n
276
+ Turn the text of a video above into a long essay:\n
277
+ """
278
+
279
+ collapse_prompt = PromptTemplate(
280
+ template=collapse_template, input_variables=["title", "doc_summaries"]
281
+ )
282
+ collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt) # LLM 1 <-- LLM
283
+
284
+ # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
285
+ collapse_documents_chain = StuffDocumentsChain(
286
+ llm_chain=collapse_chain, document_variable_name="doc_summaries"
287
+ )
288
+
289
+ # Final Reduce - Combine
290
+ combine_template_short = """\n
291
+ TITLE: `{title}`\n
292
+ TEXT:\n
293
+ `{doc_summaries}`\n
294
+ ----------------------- \n
295
+ Turn the text of a video above into a 3-sentence summary:\n
296
+ """
297
+ combine_template_medium = """\n
298
+ TITLE: `{title}`\n
299
+ TEXT:\n
300
+ `{doc_summaries}`\n
301
+ ----------------------- \n
302
+ Turn the text of a video above into a long summary:\n
303
+ """
304
+ combine_template_long = """\n
305
+ TITLE: `{title}`\n
306
+ TEXT:\n
307
+ `{doc_summaries}`\n
308
+ ----------------------- \n
309
+ Turn the text of a video above into a long essay:\n
310
+ """
311
+ # Turn the text of a video above into a 3-sentence summary:\n
312
+ # Turn the text of a video above into a long summary:\n
313
+ # Turn the text of a video above into a long essay:\n
314
+ if length == "Medium":
315
+ combine_prompt = PromptTemplate(
316
+ template=combine_template_medium,
317
+ input_variables=["title", "doc_summaries", "words"],
318
+ )
319
+ elif length == "Long":
320
+ combine_prompt = PromptTemplate(
321
+ template=combine_template_long,
322
+ input_variables=["title", "doc_summaries", "words"],
323
+ )
324
+ else: # default is short
325
+ combine_prompt = PromptTemplate(
326
+ template=combine_template_short,
327
+ input_variables=["title", "doc_summaries", "words"],
328
+ )
329
+ combine_chain = LLMChain(llm=llm2, prompt=combine_prompt) # LLM 2 <-- LLM2
330
+
331
+ # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
332
+ combine_documents_chain = StuffDocumentsChain(
333
+ llm_chain=combine_chain, document_variable_name="doc_summaries"
334
+ )
335
+
336
+ # Combines and iteratively reduces the mapped documents
337
+ reduce_documents_chain = ReduceDocumentsChain(
338
+ # This is final chain that is called.
339
+ combine_documents_chain=combine_documents_chain,
340
+ # If documents exceed context for `StuffDocumentsChain`
341
+ collapse_documents_chain=collapse_documents_chain,
342
+ # The maximum number of tokens to group documents into.
343
+ token_max=800,
344
+ )
345
+
346
+ # Combining documents by mapping a chain over them, then combining results
347
+ map_reduce_chain = MapReduceDocumentsChain(
348
+ # Map chain
349
+ llm_chain=map_chain,
350
+ # Reduce chain
351
+ reduce_documents_chain=reduce_documents_chain,
352
+ # The variable name in the llm_chain to put the documents in
353
+ document_variable_name="docs",
354
+ # Return the results of the map steps in the output
355
+ return_intermediate_steps=False,
356
+ )
357
+
358
+ from langchain.document_loaders import TextLoader
359
+ from langchain.text_splitter import TokenTextSplitter
360
+
361
+ with open(save_dir + "/transcript.txt", "w") as f:
362
+ f.write(text)
363
+ loader = TextLoader(save_dir + "/transcript.txt")
364
+ doc = loader.load()
365
+ text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
366
+ docs = text_splitter.split_documents(doc)
367
+
368
+ summary = map_reduce_chain.run(
369
+ {"input_documents": docs, "title": title, "words": words}
370
+ )
371
+
372
+ try:
373
+ del (map_reduce_chain, reduce_documents_chain,
374
+ combine_chain, collapse_documents_chain,
375
+ map_chain, collapse_chain,
376
+ llm, llm2,
377
+ pipeline, pipeline2,
378
+ model, tokenizer)
379
+ except:
380
+ pass
381
+ torch.cuda.empty_cache()
382
+
383
+ summary = paragraph_or_points(summary, pa_or_po)
384
+
385
+ return summary, summary_source
386
+
387
+
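+ # Map-reduce flow: map_chain (llm, ~175-225 new tokens) summarizes each
+ # 800-token transcript chunk; if the mapped summaries together exceed
+ # token_max=800, collapse_chain condenses them first; combine_chain (llm2,
+ # sized by `words`) then writes the final summary at the requested length.
+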
+ # 1.5 - complete function [DELETED]
+
+ # 2 - extractive/low-abstractive summary for Key Sentence Highlight
+ # 2.1 - chunking + hosted inference, summary [DELETED]
+
+ # 2.2 - add spaces before punctuation
+ import re
+ def add_space_before_punctuation(text):
+     # Define a regular expression pattern to match punctuation
+     punctuation_pattern = r"([.,!?;:])"
+
+     # Use re.sub to add a space before each punctuation mark
+     modified_text = re.sub(punctuation_pattern, r" \1", text)
+
+     return modified_text
+
+
+ # 2.3 - highlight same words (yellow)
+ from difflib import ndiff
+ def highlight_text_with_diff(text1, text2):
+     diff = list(ndiff(text1.split(), text2.split()))
+
+     highlighted_diff = []
+     for item in diff:
+         if item.startswith(" "):
+             highlighted_diff.append(
+                 '<span style="background-color: rgba(255, 255, 0, 0.25);">'
+                 + item
+                 + " </span>"
+             )  # unchanged words
+         elif item.startswith("+"):
+             highlighted_diff.append(item[2:] + " ")
+
+     return "".join(highlighted_diff)  # output in string HTML format
+
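+ # ndiff marks words common to both inputs with "  ", words only in text2 with
+ # "+ ", and words only in text1 with "- ". Called as (summary, original), this
+ # reproduces the original text while highlighting the words it shares with the
+ # summary and silently dropping summary-only words.
+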
+ # 2.4 - combined - `highlight_key_sentences`
+ # extractive/low-abstractive summarizer with facebook/bart-large-cnn
+ # highlight feature
+ def highlight_key_sentences(original_text, api_key):
+
+     import requests
+
+     API_TOKEN = api_key
+     headers = {"Authorization": f"Bearer {API_TOKEN}"}
+     API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
+
+     def query(payload):
+         response = requests.post(API_URL, headers=headers, json=payload)
+         return response.json()
+
+     def chunk_text(text, chunk_size=1024):
+         # Split the text into chunks
+         chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
+         return chunks
+
+     def summarize_long_text(long_text):
+         # Split the long text into chunks
+         text_chunks = chunk_text(long_text)
+
+         # Summarize each chunk
+         summaries = []
+         for chunk in text_chunks:
+             data = query(
+                 {
+                     "inputs": f"{chunk}",
+                     "parameters": {"do_sample": False},
+                 }
+             )  # what if do_sample=True?
+             summaries.append(data[0]["summary_text"])
+
+         # Combine the summaries of all chunks
+         full_summary = " ".join(summaries)
+         return full_summary
+
+     summarized_text = summarize_long_text(original_text)
+
+     original_text = add_space_before_punctuation(original_text)
+     summarized_text = add_space_before_punctuation(summarized_text)
+
+     return highlight_text_with_diff(summarized_text, original_text)  # output in string HTML format
+
+
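+ # Note: chunk_text splits on raw characters (1024 at a time) rather than words
+ # or tokens, so a chunk may end mid-sentence; each slice is summarized by
+ # facebook/bart-large-cnn independently and the pieces are joined afterwards.
+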
+ # 3 - extract_keywords
+ # 3.1 - initialize & load pipeline
+ from transformers import (
+     TokenClassificationPipeline,
+     AutoModelForTokenClassification,
+     AutoTokenizer,
+ )
+ from transformers.pipelines import AggregationStrategy
+ import numpy as np
+
+ # Define the keyphrase extraction pipeline
+ class KeyphraseExtractionPipeline(TokenClassificationPipeline):
+     def __init__(self, model, *args, **kwargs):
+         super().__init__(
+             model=AutoModelForTokenClassification.from_pretrained(model),
+             tokenizer=AutoTokenizer.from_pretrained(model),
+             *args,
+             **kwargs,
+         )
+
+     def postprocess(self, all_outputs):
+         results = super().postprocess(
+             all_outputs=all_outputs,
+             aggregation_strategy=AggregationStrategy.SIMPLE,
+         )
+         return np.unique([result.get("word").strip() for result in results])
+
+
+ # Load the pipeline
+ model_name = "ml6team/keyphrase-extraction-kbir-inspec"
+ extractor = KeyphraseExtractionPipeline(model=model_name)
+
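+ # This subclass follows the usage shown on the ml6team model card:
+ # AggregationStrategy.SIMPLE merges B-/I- token tags into whole keyphrases, and
+ # np.unique deduplicates them (returning them sorted alphabetically, which is
+ # why the extracted keywords are re-ordered in section 3.2 below).
+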
+ # 3.2 - re-arrange keyword order
+ import re
+ def rearrange_keywords(text, keywords):  # text: str, keywords: List
+     # Find the position of each keyword in the text
+     keyword_positions = {word: text.lower().index(word.lower()) for word in keywords}
+
+     # Sort the keywords based on their positions in the text
+     sorted_keywords = sorted(keywords, key=lambda x: keyword_positions[x])
+
+     return sorted_keywords
+
+ # 3.3 - `keywords_extractor` functions
+ def keywords_extractor_list(summary):  # List : Flashcards
+     keyphrases = extractor(summary)  # extractor() from above | text.replace("\n", " ")
+     list_keyphrases = keyphrases.tolist()
+
+     # rearrange first
+     list_keyphrases = rearrange_keywords(summary, list_keyphrases)
+
+     return list_keyphrases  # returns a List
+
+ def keywords_extractor_str(summary):  # str : Keywords Highlight & Fill in the Blank
+     keyphrases = extractor(summary)  # extractor() from above | text.replace("\n", " ")
+     list_keyphrases = keyphrases.tolist()
+
+     # rearrange first
+     list_keyphrases = rearrange_keywords(summary, list_keyphrases)
+
+     # join the List elements into one string
+     all_keyphrases = " ".join(list_keyphrases)
+
+     return all_keyphrases  # returns one string
+
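+ # Caveat: rearrange_keywords relies on str.index, which raises ValueError if a
+ # keyphrase never occurs verbatim in the text; since the extractor aggregates
+ # spans from the input itself, each keyphrase is normally found.
+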
+ # 3.4 - keywords highlight
+ # 3.4.1 - highlight same words (green)
+ def highlight_green(text1, text2):  # keywords (str), text
+     diff = list(ndiff(text1.split(), text2.split()))
+
+     highlighted_diff = []
+     for item in diff:
+         if item.startswith(" "):
+             highlighted_diff.append(
+                 '<span style="background-color: rgba(0, 255, 0, 0.25);">'
+                 + item
+                 + " </span>"
+             )  # unchanged words
+         elif item.startswith("+"):
+             highlighted_diff.append(item[2:] + " ")
+
+     return "".join(highlighted_diff)  # output in string HTML format
+
+
+ # 3.4.2 - combined - keywords highlight
+ def keywords_highlight(text):
+     keywords = keywords_extractor_str(text)  # keywords; one string
+     text = add_space_before_punctuation(text)
+     return highlight_green(keywords, text)  # output in string HTML format
+
+
+ # 3.5 - flashcards
+ # 3.5.1 - pair_keywords_sentences
+ def pair_keywords_sentences(text, search_words):  # text: str, search_words: List
+
+     result_html = "<span style='text-align: center;'>"
+
+     # Split the text into sentences
+     sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
+
+     # Create a dictionary to store the sentences for each keyword
+     keyword_sentences = {word: [] for word in search_words}
+
+     # Iterate through the sentences and search for keywords
+     for sentence in sentences:
+         for word in search_words:
+             if re.search(
+                 r"\b{}\b".format(re.escape(word)), sentence, flags=re.IGNORECASE
+             ):
+                 keyword_sentences[word].append(sentence)
+
+     # Build the results
+     for word, sentences in keyword_sentences.items():
+         result_html += "<h2>" + word + "</h2> \n"
+
+         for sentence in sentences:
+             result_html += "<p>" + sentence + "</p> \n"
+
+         result_html += "\n"
+
+     result_html += "</span>"
+
+     return result_html
+
+ # 3.5.2 - combined - flashcards
+ def flashcards(text):
+     keywords = keywords_extractor_list(text)  # keywords; a List
+     text = add_space_before_punctuation(text)
+     return pair_keywords_sentences(text, keywords)  # output in string HTML format
+
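+ # The sentence-splitting regex breaks on ". " or "? " boundaries while its
+ # lookbehinds skip common abbreviations (e.g. "U.S." or "Dr."), so each
+ # flashcard pairs one keyphrase with every sentence that mentions it.
+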
+ # 3.6 - fill in the blank
+ # 3.6.1 - underline same words
+ def underline_keywords(text1, text2):  # keywords (str), text
+     diff = list(ndiff(text1.split(), text2.split()))
+
+     highlighted_diff = []
+     for item in diff:
+         if item.startswith(" "):
+             highlighted_diff.append(
+                 "_______"
+             )  # unchanged words; make the length independent of the word length?
+         elif item.startswith("+"):
+             highlighted_diff.append(item[2:] + " ")
+
+     return "".join(highlighted_diff)  # output in string HTML format
+
+
+ # 3.6.2 - combined - underline
+ def fill_in_blanks(text):
+     keywords = keywords_extractor_str(text)  # keywords; one string
+     text = add_space_before_punctuation(text)
+     return underline_keywords(keywords, text)  # output in string HTML format
+
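+ # Words shared with the keyword string become "_______" while all other words
+ # pass through unchanged, turning the summary into a cloze-style quiz.
+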
+ # 4 - misc
+ emptyTabHTML = "<br>\n<p style='color: gray; text-align: center'>Please generate a summary first.</p>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n"
+
+
+ def empty_tab():
+     return emptyTabHTML
+
+
+ # 5 - the app
+ import gradio as gr
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<br>")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("# ✍️ Summarizer for Learning")
+         with gr.Column():
+             gr.HTML("<div style='color: red; text-align: right'>Please use your <a href='#HFAPI' style='color: red'>Hugging Face Access Token.</a></div>")
+
+     with gr.Row():
+         with gr.Column():
+             with gr.Tab("YouTube"):
+                 yt_link = gr.Textbox(show_label=False, placeholder="Insert YouTube link here ...")
+                 yt_transcript = gr.Textbox(show_label=False, placeholder="Transcript will be shown here ...", lines=12)
+             with gr.Tab("Article"):
+                 gr.Textbox(show_label=False, placeholder="WORK IN PROGRESS", interactive=False)
+                 gr.Textbox(show_label=False, placeholder="", lines=12, interactive=False)
+             with gr.Tab("Text"):
+                 gr.Dropdown(["WORK IN PROGRESS", "Example 2"], show_label=False, value="WORK IN PROGRESS", interactive=False)
+                 gr.Textbox(show_label=False, placeholder="", lines=12, interactive=False)
+             with gr.Row():
+                 clrButton = gr.ClearButton([yt_link, yt_transcript])
+                 subButton = gr.Button(variant="primary", value="Summarize")
+
+             with gr.Accordion("Settings", open=False):
+                 length = gr.Radio(["Short", "Medium", "Long"], label="Length", value="Short", interactive=True)
+                 pa_or_po = gr.Radio(["Paragraphs", "Points"], label="Summarize to", value="Paragraphs", interactive=True)
+                 gr.Checkbox(label="Add headings", interactive=False)
+                 gr.Radio(["One section", "Few sections"], label="Summarize into", interactive=False)  # info="Only for 'Medium' or 'Long'"
+                 with gr.Row():
+                     clrButtonSt1 = gr.ClearButton([length, pa_or_po], interactive=True)
+                     subButtonSt1 = gr.Button(value="Set Current as Default", interactive=False)
+                     subButtonSt1 = gr.Button(value="Show Default", interactive=False)
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 with gr.Group():
+                     gr.HTML("<p style='text-align: center;'>&nbsp; YouTube transcription</p>")
+                     force_transcribe_with_app = gr.Checkbox(
+                         label="Always transcribe with app",
+                         info="The app first checks whether captions are available on YouTube. If ticked, the app will transcribe the video itself, which is slower.",
+                     )
+                 with gr.Group():
+                     gr.HTML("<p style='text-align: center;'>&nbsp; Summarization</p>")
+                     gr.Radio(["High Abstractive", "Low Abstractive", "Extractive"], label="Type of summarization", value="High Abstractive", interactive=False)
+                     gr.Dropdown(
+                         [
+                             "tiiuae/falcon-7b-instruct",
+                             "GPT2 (work in progress)",
+                             "OpenChat 3.5 (work in progress)",
+                         ],
+                         label="Model",
+                         value="tiiuae/falcon-7b-instruct",
+                         interactive=False,
+                     )
+                     temperature = gr.Slider(0.10, 0.30, step=0.05, label="Temperature", value=0.15,
+                         info="Temperature is limited to the 0.1 ~ 0.3 window, as it is shown to produce the best results.",
+                         interactive=True,
+                     )
+                     do_sample = gr.Checkbox(label="do_sample", value=True,
+                         info="If ticked, do_sample produces more creative and diverse text; otherwise the app uses greedy decoding, which generates a more consistent and predictable summary.",
+                     )
+
+                 with gr.Group():
+                     gr.HTML("<p style='text-align: center;'>&nbsp; Highlight</p>")
+                     check_key_sen = gr.Checkbox(label="Highlight key sentences", info="In original text", value=True, interactive=False)
+                     gr.Checkbox(label="Highlight keywords", info="In summary", value=True, interactive=False)
+                     gr.Checkbox(label="Turn text to paragraphs", interactive=False)
+
+                 with gr.Group():
+                     gr.HTML("<p style='text-align: center;'>&nbsp; Quiz mode</p>")
+                     gr.Checkbox(label="Fill in the blanks", value=True, interactive=False)
+                     gr.Checkbox(label="Flashcards", value=True, interactive=False)
+                     gr.Checkbox(label="Re-write summary", interactive=False)  # info="Only for 'Short'"
+
+                 with gr.Row():
+                     clrButtonSt2 = gr.ClearButton(interactive=True)
+                     subButtonSt2 = gr.Button(value="Set Current as Default", interactive=False)
+                     subButtonSt2 = gr.Button(value="Show Default", interactive=False)
+
+         with gr.Column():
+             with gr.Tab("Summary"):  # Output
+                 title = gr.Textbox(show_label=False, placeholder="Title")
+                 summary = gr.Textbox(lines=11, show_copy_button=True, label="", placeholder="Summarized output ...")
+             with gr.Tab("Key sentences", render=True):
+                 key_sentences = gr.HTML(emptyTabHTML)
+                 showButtonKeySen = gr.Button(value="Generate")
+             with gr.Tab("Keywords", render=True):
+                 keywords = gr.HTML(emptyTabHTML)
+                 showButtonKeyWor = gr.Button(value="Generate")
+             with gr.Tab("Fill in the blank", render=True):
+                 blanks = gr.HTML(emptyTabHTML)
+                 showButtonFilBla = gr.Button(value="Generate")
+             with gr.Tab("Flashcards", render=True):
+                 flashCrd = gr.HTML(emptyTabHTML)
+                 showButtonFlash = gr.Button(value="Generate")
+             gr.Markdown("<span style='color: gray'> The app is still a work in progress. The output may be odd and some features are still disabled. [Learn more]().</span>")
+             with gr.Group():
+                 gr.HTML("<p id='HFAPI' style='text-align: center;'>&nbsp; 🤗 Hugging Face Access Token [<a href='https://huggingface.co/docs/hub/security-tokens'>more</a>]</p>")
+                 hf_access_token = gr.Textbox(
+                     show_label=False,
+                     placeholder="example: hf_******************************",
+                     type="password",
+                     info="The app does not store the token.",
+                 )
+             with gr.Accordion("Info", open=False, visible=False):
+                 transcript_source = gr.Textbox(show_label=False, placeholder="transcript_source")
+                 summary_source = gr.Textbox(show_label=False, placeholder="summary_source")
+                 words = gr.Slider(minimum=100, maximum=500, value=250, label="Length of the summary")
+                 # words: what should be the constant value?
+                 use_api = gr.Checkbox(label="use_api", value=True)
+
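+     # Event wiring: the two handlers below are chained, so summarize_text runs
+     # only after transcribe_youtube_video has filled in the transcript.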
+     subButton.click(
+         fn=transcribe_youtube_video,
+         inputs=[yt_link, force_transcribe_with_app, use_api, hf_access_token],
+         outputs=[title, yt_transcript, transcript_source],
+         queue=True,
+     ).then(
+         fn=summarize_text,
+         inputs=[title, yt_transcript, temperature, words, use_api, hf_access_token, do_sample, length, pa_or_po],
+         outputs=[summary, summary_source],
+         api_name="summarize_text",
+         queue=True,
+     )
+
+     subButton.click(fn=empty_tab, outputs=[key_sentences])
+     subButton.click(fn=empty_tab, outputs=[keywords])
+     subButton.click(fn=empty_tab, outputs=[flashCrd])
+     subButton.click(fn=empty_tab, outputs=[blanks])
+
+     # Key sentences
+     showButtonKeySen.click(
+         fn=highlight_key_sentences,
+         inputs=[yt_transcript, hf_access_token],
+         outputs=[key_sentences],
+         queue=True,
+     )
+
+     # Keywords
+     showButtonKeyWor.click(fn=keywords_highlight, inputs=[summary], outputs=[keywords], queue=True)
+
+     # Flashcards
+     showButtonFlash.click(fn=flashcards, inputs=[summary], outputs=[flashCrd], queue=True)
+
+     # Fill in the blanks
+     showButtonFilBla.click(fn=fill_in_blanks, inputs=[summary], outputs=[blanks], queue=True)
+
+ if __name__ == "__main__":
+     demo.launch(show_api=False)
+     # demo.launch(show_api=False, debug=True)
+     # demo.launch(show_api=False, share=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ gradio
+ git+https://github.com/openai/whisper.git
+ youtube-transcript-api
+ langchain
+ numpy
+ pytube
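+ # Note: app.py also imports torch, librosa, soundfile, beautifulsoup4, and
+ # huggingface_hub; on a stock environment these may need to be listed here too.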