Update utils.py
Browse files
utils.py
CHANGED
@@ -302,6 +302,17 @@ def load_word_with_metadata(file_path):
|
|
302 |
|
303 |
################################################
|
304 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
def document_loading_splitting():
|
306 |
##############################
|
307 |
# Document loading
|
@@ -335,6 +346,7 @@ def document_loading_splitting():
|
|
335 |
# Vorverarbeitung der Dokumente
|
336 |
preprocessed_docs = []
|
337 |
original_docs = []
|
|
|
338 |
for doc in docs:
|
339 |
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
340 |
preprocessed_content = preprocess_text(doc.page_content)
|
@@ -342,7 +354,8 @@ def document_loading_splitting():
|
|
342 |
preprocessed_metadata = {
|
343 |
"title": preprocessed_title,
|
344 |
"page": doc.metadata["page"],
|
345 |
-
"path": doc.metadata["path"]
|
|
|
346 |
}
|
347 |
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
348 |
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
@@ -354,12 +367,13 @@ def document_loading_splitting():
|
|
354 |
# Document splitting
|
355 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
356 |
splits = text_splitter.split_documents(preprocessed_docs)
|
|
|
357 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
358 |
-
original_splits =
|
359 |
-
preprocessed_splits =
|
360 |
|
361 |
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
|
362 |
-
split_to_original_mapping = {p_split.
|
363 |
|
364 |
|
365 |
print("Splits...........................")
|
@@ -720,11 +734,9 @@ class Document:
|
|
720 |
self.metadata = {
|
721 |
"title": title,
|
722 |
"page": page,
|
723 |
-
"path": path
|
|
|
724 |
}
|
725 |
-
self.doc_id = doc_id
|
726 |
-
|
727 |
-
|
728 |
|
729 |
|
730 |
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
|
|
|
302 |
|
303 |
################################################
|
304 |
#die Inhalte splitten, um in Vektordatenbank entsprechend zu laden als Splits
|
305 |
+
# Funktion zum Splitten und Zuweisen der doc_id
|
306 |
+
def split_documents_with_id(docs, text_splitter=None):
    """Split documents into chunks that inherit the parent's metadata and doc_id.

    Every chunk produced from a document is wrapped in a new ``Document`` that
    carries the parent's title, page, path and doc_id, so a chunk can be traced
    back to the document it came from.

    Args:
        docs: Iterable of objects exposing ``page_content`` (text) and a
            ``metadata`` dict with the keys ``"title"``, ``"page"`` and
            ``"path"``.
        text_splitter: Object with a ``split_text(str)`` method returning the
            chunk strings. When omitted, a ``RecursiveCharacterTextSplitter``
            with ``chunk_size=1000`` and ``chunk_overlap=200`` is created, the
            same settings ``document_loading_splitting()`` uses.

    Returns:
        A list of ``Document`` chunks in input order; all chunks of one input
        document share that document's doc_id.

    Raises:
        KeyError: If a document's metadata lacks ``"title"``, ``"page"`` or
            ``"path"``.
    """
    if text_splitter is None:
        # The previous body read a bare `text_splitter` name, which in the
        # reviewed code exists only as a local of document_loading_splitting();
        # calling this helper therefore raised NameError. Build the splitter
        # here unless the caller supplies one.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    splits = []
    for doc in docs:
        metadata = doc.metadata
        # Document now stores the id inside its metadata instead of a
        # `doc_id` attribute; keep the attribute as a fallback for objects
        # that still expose it.
        doc_id = metadata.get("doc_id")
        if doc_id is None:
            doc_id = getattr(doc, "doc_id", None)

        title = metadata["title"]
        page = metadata["page"]
        path = metadata["path"]

        splits.extend(
            Document(content=chunk, title=title, page=page, path=path, doc_id=doc_id)
            for chunk in text_splitter.split_text(doc.page_content)
        )
    return splits
|
314 |
+
|
315 |
+
#finally die Splits erzeugen und laden.....
|
316 |
def document_loading_splitting():
|
317 |
##############################
|
318 |
# Document loading
|
|
|
346 |
# Vorverarbeitung der Dokumente
|
347 |
preprocessed_docs = []
|
348 |
original_docs = []
|
349 |
+
|
350 |
for doc in docs:
|
351 |
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
352 |
preprocessed_content = preprocess_text(doc.page_content)
|
|
|
354 |
preprocessed_metadata = {
|
355 |
"title": preprocessed_title,
|
356 |
"page": doc.metadata["page"],
|
357 |
+
"path": doc.metadata["path"],
|
358 |
+
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
359 |
}
|
360 |
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
361 |
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
|
|
367 |
# Document splitting
|
368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
369 |
splits = text_splitter.split_documents(preprocessed_docs)
|
370 |
+
|
371 |
# Split sowohl für originale als auch für vorverarbeitete Dokumente
|
372 |
+
original_splits = split_documents_with_id(original_docs)
|
373 |
+
preprocessed_splits = split_documents_with_id(preprocessed_docs)
|
374 |
|
375 |
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
|
376 |
+
split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
377 |
|
378 |
|
379 |
print("Splits...........................")
|
|
|
734 |
self.metadata = {
|
735 |
"title": title,
|
736 |
"page": page,
|
737 |
+
"path": path,
|
738 |
+
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
739 |
}
|
|
|
|
|
|
|
740 |
|
741 |
|
742 |
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
|