Spaces:

huudan12345
/

tl

Running

pham thuy tien committed on Jun 27, 2024

Commit

e40fde0

verified ·

1 Parent(s): db605b5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -277,19 +277,19 @@ def get_transcript(video_id):
     words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
     for t in transcript_data:
-        if t['text'].lower().strip() not in words_to_remove:
             print()
             print(t['text'])
             print()
             tran.append(t['text'])
     return ' '.join(tran)
 def chunk_text(text, chunk_size=1000, overlap_size=24):
     encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
                                                                      chunk_overlap=overlap_size)

     words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
     for t in transcript_data:
+        text = t['text'].lower().strip()
+        if not any(word in text for word in words_to_remove):
             print()
             print(t['text'])
             print()
             tran.append(t['text'])
     return ' '.join(tran)
 def chunk_text(text, chunk_size=1000, overlap_size=24):
     encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
                                                                      chunk_overlap=overlap_size)