Update utils.py
Browse files
utils.py
CHANGED
@@ -306,9 +306,10 @@ def load_word_with_metadata(file_path):
|
|
306 |
def split_documents_with_id(docs, text_splitter):
|
307 |
splits = []
|
308 |
for doc in docs:
|
309 |
-
doc_splits = text_splitter.split_text(doc.page_content)
|
310 |
for split_content in doc_splits:
|
311 |
-
|
|
|
312 |
splits.append(split_doc)
|
313 |
return splits
|
314 |
|
@@ -342,38 +343,26 @@ def document_loading_splitting():
|
|
342 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
343 |
#docs.extend(loader.load())
|
344 |
|
345 |
-
|
346 |
-
# Vorverarbeitung der Dokumente
|
347 |
-
preprocessed_docs = []
|
348 |
-
original_docs = []
|
349 |
-
|
350 |
-
for doc in docs:
|
351 |
-
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
352 |
-
preprocessed_content = preprocess_text(doc.page_content)
|
353 |
-
preprocessed_title = preprocess_text(doc.metadata["title"])
|
354 |
-
preprocessed_metadata = {
|
355 |
-
"title": preprocessed_title,
|
356 |
-
"page": doc.metadata["page"],
|
357 |
-
"path": doc.metadata["path"],
|
358 |
-
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
359 |
-
}
|
360 |
-
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
361 |
-
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
362 |
-
preprocessed_docs.append(preprocessed_doc)
|
363 |
-
original_docs.append(original_doc)
|
364 |
-
print("orgin doc....................................."+str(original_doc))
|
365 |
|
366 |
################################
|
367 |
# Document splitting
|
368 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
369 |
#splits = text_splitter.split_documents(preprocessed_docs)
|
370 |
|
371 |
-
|
372 |
-
|
373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
-
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der
|
376 |
-
split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
377 |
|
378 |
|
379 |
print("Splits...........................")
|
@@ -381,7 +370,7 @@ def document_loading_splitting():
|
|
381 |
if 'divis' in split.page_content:
|
382 |
print("DIVIS found in chunk:", split)
|
383 |
|
384 |
-
return preprocessed_splits, split_to_original_mapping
|
385 |
|
386 |
###########################################
|
387 |
#Chroma DB die splits ablegen - vektorisiert...
|
@@ -580,8 +569,6 @@ def extract_document_info(documents):
|
|
580 |
else:
|
581 |
download_link = doc_path
|
582 |
|
583 |
-
# Prüfe, ob doc_id existiert und weise einen Standardwert zu, falls nicht
|
584 |
-
id = getattr(doc, 'doc_id', None)
|
585 |
|
586 |
info = {
|
587 |
'content': doc.page_content,
|
@@ -590,7 +577,6 @@ def extract_document_info(documents):
|
|
590 |
'seite': doc.metadata.get("page", "Unbekannte Seite"),
|
591 |
'pfad': doc_path,
|
592 |
'download_link': download_link,
|
593 |
-
'id': id
|
594 |
}
|
595 |
extracted_info.append(info)
|
596 |
return extracted_info
|
@@ -735,7 +721,7 @@ class Document:
|
|
735 |
"title": title,
|
736 |
"page": page,
|
737 |
"path": path,
|
738 |
-
"doc_id": doc_id # Füge die ID in die Metadaten ein
|
739 |
}
|
740 |
|
741 |
|
|
|
306 |
def split_documents_with_id(docs, text_splitter):
    """Split every document into text chunks, tagging each chunk with a fresh UUID.

    The document title is prepended to the body text before splitting, so the
    splitter operates on one combined string per document.

    Args:
        docs: iterable of documents exposing ``page_content`` and a ``metadata``
            dict with ``title``, ``page`` and ``path`` keys.
        text_splitter: object providing ``split_text(str) -> list[str]``
            (e.g. a ``RecursiveCharacterTextSplitter``).

    Returns:
        list: ``Document`` chunks, each carrying a unique ``split_id``.
    """
    chunks = []
    for source_doc in docs:
        meta = source_doc.metadata
        pieces = text_splitter.split_text(f"{meta['title']} {source_doc.page_content}")
        chunks.extend(
            Document(
                content=piece,
                title=meta["title"],
                page=meta["page"],
                path=meta["path"],
                split_id=str(uuid.uuid4()),  # unique id per chunk
            )
            for piece in pieces
        )
    return chunks
|
315 |
|
|
|
343 |
#loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
|
344 |
#docs.extend(loader.load())
|
345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
|
347 |
################################
|
348 |
# Document splitting
|
349 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
350 |
#splits = text_splitter.split_documents(preprocessed_docs)
|
351 |
|
352 |
+
# Vorverarbeitung der Dokumente
|
353 |
+
# Split der Originaldokumente
|
354 |
+
original_splits = split_documents_with_id(docs, text_splitter)
|
355 |
+
|
356 |
+
# Vorverarbeitung der Originalsplits
|
357 |
+
preprocessed_splits = []
|
358 |
+
for split in original_splits:
|
359 |
+
preprocessed_content = preprocess_text(split.page_content)
|
360 |
+
preprocessed_split = Document(content=preprocessed_content, title=split.metadata["title"], page=split.metadata["page"], path=split.metadata["path"], split_id=split.metadata["split_id"])
|
361 |
+
preprocessed_splits.append(preprocessed_split)
|
362 |
+
|
363 |
|
364 |
+
# Mapping von vorverarbeiteten Splits zu Originalsplits anhand der split_ids
|
365 |
+
split_to_original_mapping = {p_split.metadata["split_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
|
366 |
|
367 |
|
368 |
print("Splits...........................")
|
|
|
370 |
if 'divis' in split.page_content:
|
371 |
print("DIVIS found in chunk:", split)
|
372 |
|
373 |
+
return preprocessed_splits, original_splits, split_to_original_mapping
|
374 |
|
375 |
###########################################
|
376 |
#Chroma DB die splits ablegen - vektorisiert...
|
|
|
569 |
else:
|
570 |
download_link = doc_path
|
571 |
|
|
|
|
|
572 |
|
573 |
info = {
|
574 |
'content': doc.page_content,
|
|
|
577 |
'seite': doc.metadata.get("page", "Unbekannte Seite"),
|
578 |
'pfad': doc_path,
|
579 |
'download_link': download_link,
|
|
|
580 |
}
|
581 |
extracted_info.append(info)
|
582 |
return extracted_info
|
|
|
721 |
"title": title,
|
722 |
"page": page,
|
723 |
"path": path,
|
724 |
+
"split_id": split_id # Füge die ID in die Metadaten ein
|
725 |
}
|
726 |
|
727 |
|