Update utils.py
Browse files
utils.py
CHANGED
@@ -341,19 +341,24 @@ def document_loading_splitting():
|
|
341 |
"page": doc.metadata["page"],
|
342 |
"path": doc.metadata["path"]
|
343 |
}
|
344 |
-
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
################################
|
348 |
# Document splitting
|
349 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
350 |
splits = text_splitter.split_documents(preprocessed_docs)
|
351 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
352 |
-
original_splits = text_splitter.split_documents(
|
353 |
preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
|
354 |
|
355 |
-
# Mapping von vorverarbeiteten Splits zu Originalsplits
|
356 |
-
split_to_original_mapping = {p_split: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
357 |
|
358 |
|
359 |
print("Splits...........................")
|
|
|
341 |
"page": doc.metadata["page"],
|
342 |
"path": doc.metadata["path"]
|
343 |
}
|
344 |
+
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"])
|
345 |
+
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"])
|
346 |
+
# Assign one shared UUID to the preprocessed document and its original counterpart
|
347 |
+
preprocessed_doc.id = str(uuid.uuid4())
|
348 |
+
original_doc.id = preprocessed_doc.id
|
349 |
+
preprocessed_docs.append(preprocessed_doc)
|
350 |
+
original_docs.append(original_doc)
|
351 |
|
352 |
################################
|
353 |
# Document splitting
|
354 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
355 |
splits = text_splitter.split_documents(preprocessed_docs)
|
356 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
357 |
+
original_splits = text_splitter.split_documents(original_docs)
|
358 |
preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
|
359 |
|
360 |
+
# Map each preprocessed split's id to the original split at the same position (zip pairs by order, not by matching ids, and stops at the shorter list)
|
361 |
+
split_to_original_mapping = {p_split.id: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
362 |
|
363 |
|
364 |
print("Splits...........................")
|