Update utils.py
Browse files
utils.py
CHANGED
@@ -329,10 +329,24 @@ def document_loading_splitting():
|
|
329 |
# Load YouTube
|
330 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
331 |
#docs.extend(loader.load())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
################################
|
333 |
# Document splitting
|
334 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
335 |
-
splits = text_splitter.split_documents(
|
336 |
print("Splits...........................")
|
337 |
for split in splits:
|
338 |
if 'DIVIS' in split.page_content:
|
|
|
329 |
# Load YouTube
|
330 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
331 |
#docs.extend(loader.load())
|
332 |
+
|
333 |
+
|
334 |
+
# Preprocess the documents - matching the preprocessing applied to the prompts...
|
335 |
+
preprocessed_docs = []
|
336 |
+
for doc in docs:
|
337 |
+
preprocessed_content = preprocess_text(doc.page_content)
|
338 |
+
preprocessed_title = preprocess_text(doc.metadata["title"])
|
339 |
+
preprocessed_metadata = {
|
340 |
+
"title": preprocessed_title,
|
341 |
+
"page": doc.metadata["page"],
|
342 |
+
"path": doc.metadata["path"]
|
343 |
+
}
|
344 |
+
preprocessed_docs.append(Document(metadata=preprocessed_metadata, page_content=preprocessed_content))
|
345 |
+
|
346 |
################################
|
347 |
# Document splitting
|
348 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
349 |
+
splits = text_splitter.split_documents(preprocessed_docs)
|
350 |
print("Splits...........................")
|
351 |
for split in splits:
|
352 |
if 'DIVIS' in split.page_content:
|