tiennlu committed on
Commit
e2db9fa
·
verified ·
1 Parent(s): 8b5e299

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import time
 
3
  from typing import Union, AnyStr
4
  from urllib.parse import urlparse, parse_qs
5
  import textwrap
@@ -17,6 +18,23 @@ import torch.nn.functional as F
17
  client = OpenAI(
18
  api_key='sk-proj-RzzAiW2c6iPtPOuACspxT3BlbkFJxuwC04BRyCcEnNKjcC6Z'
19
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
22
  """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
@@ -261,6 +279,7 @@ def summarize_youtube_video(youtube_url, outputs_dir):
261
  audio_filename, segment_length=segment_length, output_dir=chunks_dir
262
  )
263
  en_transcript = transcribe_audio(chunked_audio_files)
 
264
  vi_transcript = translate_text(en_transcript)
265
  summ_en = summary(en_transcript, 'en')
266
  summ_vi = summary(vi_transcript, 'vi')
 
1
  import os
2
  import time
3
+ import re
4
  from typing import Union, AnyStr
5
  from urllib.parse import urlparse, parse_qs
6
  import textwrap
 
18
  client = OpenAI(
19
  api_key='sk-proj-RzzAiW2c6iPtPOuACspxT3BlbkFJxuwC04BRyCcEnNKjcC6Z'
20
  )
21
def cleaning_input(input_text) -> str:
    """Normalize free-form text into a single clean line.

    Steps, in order:
      1. Convert the input to ``str`` and decode HTML entities.
      2. Collapse runs of the same punctuation mark (``. , - _ ! ?``)
         into a single occurrence.
      3. Insert a space between a digit and an adjacent ASCII letter
         (``10kg`` -> ``10 kg``, ``x2`` -> ``x 2``).
      4. Drop every character that is not a word character, whitespace,
         or one of ``[ ] ( ) $ \\ . / : # < > { } , " ! @ * = ? -``.
      5. Collapse every whitespace run (including newlines) to one space.

    Args:
        input_text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    from html import unescape

    # Decode entities *before* any other rewriting: the digit/letter
    # splitting below would otherwise break hex entities such as
    # ``&#x3A;`` (turning them into ``&#x 3A;``) so they never decode.
    text = unescape(str(input_text))

    # One back-referenced pattern replaces the six per-character
    # "collapse repeats" substitutions.
    text = re.sub(r'([.,\-_!?])\1+', r'\1', text)

    # Separate digits from adjacent ASCII letters in both directions.
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)

    # Whitelist of characters to keep. The hyphen is placed last so it is
    # a literal rather than a range operator, and "?" is included so the
    # marks preserved by the collapse step above are not deleted here.
    # NOTE(review): apostrophes are still removed ("don't" -> "dont");
    # confirm whether that is intended for downstream translation.
    text = re.sub(r'[^\w\s\[\]()$\\./:#<>{},"!@*=?-]', '', text)

    # The final collapse also covers blank lines and repeated spaces, so
    # no separate newline/space passes are needed.
    return re.sub(r'\s+', ' ', text)
38
 
39
  def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
40
  """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
 
279
  audio_filename, segment_length=segment_length, output_dir=chunks_dir
280
  )
281
  en_transcript = transcribe_audio(chunked_audio_files)
282
+ en_transcript = cleaning_input(en_transcript)
283
  vi_transcript = translate_text(en_transcript)
284
  summ_en = summary(en_transcript, 'en')
285
  summ_vi = summary(vi_transcript, 'vi')