alexkueck committed on
Commit
05ab39b
·
verified ·
1 Parent(s): 3fa9bf8

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +10 -5
utils.py CHANGED
@@ -341,19 +341,24 @@ def document_loading_splitting():
341
  "page": doc.metadata["page"],
342
  "path": doc.metadata["path"]
343
  }
344
- preprocessed_docs.append(Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"]))
345
-
 
 
 
 
 
346
 
347
  ################################
348
  # Document splitting
349
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
350
  splits = text_splitter.split_documents(preprocessed_docs)
351
  # Split sowohl für originale als auch für vorverarbeitete Dokumente
352
- original_splits = text_splitter.split_documents(docs)
353
  preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
354
 
355
- # Mapping von vorverarbeiteten Splits zu Originalsplits
356
- split_to_original_mapping = {p_split: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
357
 
358
 
359
  print("Splits...........................")
 
341
  "page": doc.metadata["page"],
342
  "path": doc.metadata["path"]
343
  }
344
+ preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"])
345
+ original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"])
346
+ # Weise IDs zu
347
+ preprocessed_doc.id = str(uuid.uuid4())
348
+ original_doc.id = preprocessed_doc.id
349
+ preprocessed_docs.append(preprocessed_doc)
350
+ original_docs.append(original_doc)
351
 
352
  ################################
353
  # Document splitting
354
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
355
  splits = text_splitter.split_documents(preprocessed_docs)
356
  # Split sowohl für originale als auch für vorverarbeitete Dokumente
357
+ original_splits = text_splitter.split_documents(original_docs)
358
  preprocessed_splits = text_splitter.split_documents(preprocessed_docs)
359
 
360
+ # Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
361
+ split_to_original_mapping = {p_split.id: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
362
 
363
 
364
  print("Splits...........................")