Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
import time
|
|
|
3 |
from typing import Union, AnyStr
|
4 |
from urllib.parse import urlparse, parse_qs
|
5 |
import textwrap
|
@@ -17,6 +18,23 @@ import torch.nn.functional as F
|
|
17 |
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is committed to the repository. The key that was previously hard-coded
# here has been published and should be revoked.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
|
22 |
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
|
@@ -261,6 +279,7 @@ def summarize_youtube_video(youtube_url, outputs_dir):
|
|
261 |
audio_filename, segment_length=segment_length, output_dir=chunks_dir
|
262 |
)
|
263 |
en_transcript = transcribe_audio(chunked_audio_files)
|
|
|
264 |
vi_transcript = translate_text(en_transcript)
|
265 |
summ_en = summary(en_transcript, 'en')
|
266 |
summ_vi = summary(vi_transcript, 'vi')
|
|
|
1 |
import os
|
2 |
import time
|
3 |
+
import re
|
4 |
from typing import Union, AnyStr
|
5 |
from urllib.parse import urlparse, parse_qs
|
6 |
import textwrap
|
|
|
18 |
# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is committed to the repository. The key that was previously hard-coded
# here has been published and should be revoked.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)
|
21 |
+
def cleaning_input(input_text):
    """Normalize raw text into a single, lightly sanitized line.

    Steps, in order:
      1. Coerce the input to ``str`` and decode HTML entities.
      2. Collapse runs of the same punctuation mark (``. , - _ ! ?``) to one.
      3. Insert a space at every ASCII-letter/digit boundary ("5kg" -> "5 kg").
      4. Drop every character that is not a word character, whitespace, or
         one of ``[ ] ( ) $ \\ . / : # < > { } , " ! ? @ * = -``.
      5. Collapse every whitespace run (including newlines) to one space.

    Args:
        input_text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned text as a ``str``. Leading/trailing whitespace is reduced
        to at most one space but not stripped.
    """
    from html import unescape

    # Entities must be decoded before the letter/digit split below, otherwise
    # references such as "&#x27;" are torn apart ("&#x 27;") and never decoded.
    text = unescape(str(input_text))

    # One pass handles every repeated-punctuation case: the backreference only
    # matches repeats of the *same* character, so "..,," becomes ".,".
    text = re.sub(r'([.,\-_!?])\1+', r'\1', text)

    # Separate digits from adjacent ASCII letters, in both directions.
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)

    # Whitelist filter. The hyphen is escaped so it is a literal rather than a
    # range operator, and "?" is kept so questions survive the cleaning.
    text = re.sub(r'[^\w\s\[\]()$\\./:#<>{},"!?@*=\-]', '', text)

    # Final whitespace normalization; this also flattens blank lines and
    # repeated spaces, so no earlier whitespace pass is needed.
    text = re.sub(r'\s+', ' ', text)
    return text
|
38 |
|
39 |
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
|
40 |
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
|
|
|
279 |
audio_filename, segment_length=segment_length, output_dir=chunks_dir
|
280 |
)
|
281 |
en_transcript = transcribe_audio(chunked_audio_files)
|
282 |
+
en_transcript = cleaning_input(en_transcript)
|
283 |
vi_transcript = translate_text(en_transcript)
|
284 |
summ_en = summary(en_transcript, 'en')
|
285 |
summ_vi = summary(vi_transcript, 'vi')
|