pham thuy tien committed on
Commit
e40fde0
·
verified ·
1 Parent(s): db605b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -277,19 +277,19 @@ def get_transcript(video_id):
277
 
278
  words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
279
 
280
-
281
  for t in transcript_data:
282
- if t['text'].lower().strip() not in words_to_remove:
 
283
  print()
284
  print(t['text'])
285
  print()
286
  tran.append(t['text'])
287
 
288
-
289
  return ' '.join(tran)
290
 
291
 
292
 
 
293
  def chunk_text(text, chunk_size=1000, overlap_size=24):
294
  encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
295
  chunk_overlap=overlap_size)
 
277
 
278
  words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
279
 
 
280
  for t in transcript_data:
281
+ text = t['text'].lower().strip()
282
+ if not any(word in text for word in words_to_remove):
283
  print()
284
  print(t['text'])
285
  print()
286
  tran.append(t['text'])
287
 
 
288
  return ' '.join(tran)
289
 
290
 
291
 
292
+
293
  def chunk_text(text, chunk_size=1000, overlap_size=24):
294
  encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
295
  chunk_overlap=overlap_size)