Spaces:
Running
Running
pham thuy tien
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -277,19 +277,19 @@ def get_transcript(video_id):
|
|
277 |
|
278 |
words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
|
279 |
|
280 |
-
|
281 |
for t in transcript_data:
|
282 |
-
|
|
|
283 |
print()
|
284 |
print(t['text'])
|
285 |
print()
|
286 |
tran.append(t['text'])
|
287 |
|
288 |
-
|
289 |
return ' '.join(tran)
|
290 |
|
291 |
|
292 |
|
|
|
293 |
def chunk_text(text, chunk_size=1000, overlap_size=24):
|
294 |
encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
|
295 |
chunk_overlap=overlap_size)
|
|
|
277 |
|
278 |
words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
|
279 |
|
|
|
280 |
for t in transcript_data:
|
281 |
+
text = t['text'].lower().strip()
|
282 |
+
if not any(word in text for word in words_to_remove):
|
283 |
print()
|
284 |
print(t['text'])
|
285 |
print()
|
286 |
tran.append(t['text'])
|
287 |
|
|
|
288 |
return ' '.join(tran)
|
289 |
|
290 |
|
291 |
|
292 |
+
|
293 |
def chunk_text(text, chunk_size=1000, overlap_size=24):
|
294 |
encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
|
295 |
chunk_overlap=overlap_size)
|