Spaces:

tiennlu
/

tl

Sleeping

tiennlu committed on Jun 13, 2024

Commit

e2db9fa

verified ·

1 Parent(s): 8b5e299

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import time
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs
 import textwrap
@@ -17,6 +18,23 @@ import torch.nn.functional as F
 # Read the API key from the environment rather than committing it to source
 # control; any key that has ever been committed must be treated as
 # compromised and rotated.
 client = OpenAI(
     api_key=os.environ.get('OPENAI_API_KEY')
 )
 def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
     """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
@@ -261,6 +279,7 @@ def summarize_youtube_video(youtube_url, outputs_dir):
             audio_filename, segment_length=segment_length, output_dir=chunks_dir
         )
         en_transcript = transcribe_audio(chunked_audio_files)
     vi_transcript = translate_text(en_transcript)
     summ_en = summary(en_transcript, 'en')
     summ_vi = summary(vi_transcript, 'vi')

 import os
 import time
+import re
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs
 import textwrap
 # Read the API key from the environment rather than committing it to source
 # control; any key that has ever been committed must be treated as
 # compromised and rotated.
 client = OpenAI(
     api_key=os.environ.get('OPENAI_API_KEY')
 )
# Runs of the same punctuation mark ("...", "!!", "--") collapse to one mark.
_REPEATED_PUNCT_RE = re.compile(r'([.,\-_!?])\1+')
_DIGIT_THEN_LETTER_RE = re.compile(r'(\d)([A-Za-z])')
_LETTER_THEN_DIGIT_RE = re.compile(r'([A-Za-z])(\d)')
# Everything outside this whitelist is dropped.  The hyphen is placed last so
# it is a literal rather than a range operator.
_DISALLOWED_CHARS_RE = re.compile(r'[^\w\s\[\]()$\\./:#<>{},"!?@*=-]')
_WHITESPACE_RE = re.compile(r'\s+')


def cleaning_input(input_text):
    """Normalize raw text before it is translated and summarized.

    Steps, in order:
      1. collapse runs of ``.``, ``,``, ``-``, ``_``, ``!`` and ``?`` to a
         single character;
      2. insert a space at every ASCII-letter/digit boundary
         (``"mp3"`` -> ``"mp 3"``);
      3. decode HTML entities;
      4. drop every character that is not a word character, whitespace, or
         one of ``[ ] ( ) $ \\ . / : # < > { } , " ! ? @ * = -``;
      5. collapse every whitespace run (including newlines) to one space.

    Args:
        input_text: Text to clean.  Non-string values are converted with
            ``str()``; ``None`` is treated as empty input.

    Returns:
        The cleaned text as a single line.  Leading/trailing spaces are not
        stripped.
    """
    from html import unescape

    # str(None) would otherwise inject the literal word "None" downstream.
    if input_text is None:
        return ''

    text = str(input_text)
    text = _REPEATED_PUNCT_RE.sub(r'\1', text)
    text = _DIGIT_THEN_LETTER_RE.sub(r'\1 \2', text)
    text = _LETTER_THEN_DIGIT_RE.sub(r'\1 \2', text)
    text = unescape(text)
    # NOTE(review): apostrophes, '%', '+', ';' and '&' are still removed here
    # -- confirm that is intended for transcript text.
    text = _DISALLOWED_CHARS_RE.sub('', text)
    # A single whitespace collapse at the end covers blank lines and repeated
    # spaces alike, so no earlier whitespace pass is needed.
    return _WHITESPACE_RE.sub(' ', text)
 def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
     """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
             audio_filename, segment_length=segment_length, output_dir=chunks_dir
         )
         en_transcript = transcribe_audio(chunked_audio_files)
+    en_transcript = cleaning_input(en_transcript)
     vi_transcript = translate_text(en_transcript)
     summ_en = summary(en_transcript, 'en')
     summ_vi = summary(vi_transcript, 'vi')