Update utils.py
Browse files
utils.py
CHANGED
@@ -336,6 +336,7 @@ def document_loading_splitting():
|
|
336 |
preprocessed_docs = []
|
337 |
original_docs = []
|
338 |
for doc in docs:
|
|
|
339 |
preprocessed_content = preprocess_text(doc.page_content)
|
340 |
preprocessed_title = preprocess_text(doc.metadata["title"])
|
341 |
preprocessed_metadata = {
|
@@ -343,11 +344,8 @@ def document_loading_splitting():
|
|
343 |
"page": doc.metadata["page"],
|
344 |
"path": doc.metadata["path"]
|
345 |
}
|
346 |
-
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"])
|
347 |
-
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"])
|
348 |
-
# Weise IDs zu
|
349 |
-
preprocessed_doc.id = str(uuid.uuid4())
|
350 |
-
original_doc.id = preprocessed_doc.id
|
351 |
preprocessed_docs.append(preprocessed_doc)
|
352 |
original_docs.append(original_doc)
|
353 |
|
@@ -710,14 +708,14 @@ shared_state = State()
|
|
710 |
|
711 |
#Für die relevanten Dokumente - damit sie passend zum Dictionary die Attribute haben
|
712 |
class Document:
|
713 |
-
def __init__(self, content, title, page, path):
|
714 |
self.page_content = content
|
715 |
self.metadata = {
|
716 |
"title": title,
|
717 |
"page": page,
|
718 |
"path": path
|
719 |
}
|
720 |
-
self.id =
|
721 |
|
722 |
|
723 |
|
|
|
336 |
preprocessed_docs = []
|
337 |
original_docs = []
|
338 |
for doc in docs:
|
339 |
+
doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
|
340 |
preprocessed_content = preprocess_text(doc.page_content)
|
341 |
preprocessed_title = preprocess_text(doc.metadata["title"])
|
342 |
preprocessed_metadata = {
|
|
|
344 |
"page": doc.metadata["page"],
|
345 |
"path": doc.metadata["path"]
|
346 |
}
|
347 |
+
preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
|
348 |
+
original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
|
|
|
|
|
|
|
349 |
preprocessed_docs.append(preprocessed_doc)
|
350 |
original_docs.append(original_doc)
|
351 |
|
|
|
708 |
|
709 |
# For the relevant documents - so that they carry attributes matching the
# dictionary layout (page_content / metadata / id) used elsewhere.
class Document:
    """Minimal document record exposing ``page_content``, ``metadata`` and ``id``.

    Attributes:
        page_content: The text passed in as ``content``.
        metadata: Dict with exactly the keys ``"title"``, ``"page"`` and
            ``"path"``, filled from the constructor arguments.
        id: The value passed as ``doc_id`` (``None`` when omitted).
    """

    def __init__(self, content, title, page, path, doc_id=None):
        """Store the content, the metadata fields and an optional identifier.

        Args:
            content: Document text; stored unchanged as ``page_content``.
            title: Stored under ``metadata["title"]``.
            page: Stored under ``metadata["page"]``.
            path: Stored under ``metadata["path"]``.
            doc_id: Optional identifier stored as ``id``. Defaults to ``None``;
                no identifier is generated here, so callers that need one
                must supply it.
        """
        self.page_content = content
        self.metadata = {
            "title": title,
            "page": page,
            "path": path,
        }
        self.id = doc_id

    def __repr__(self):
        """Return a compact debug representation.

        The page content is deliberately left out because it can be
        arbitrarily long; id and metadata are enough to identify a record.
        """
        return f"{type(self).__name__}(id={self.id!r}, metadata={self.metadata!r})"
|
719 |
|
720 |
|
721 |
|