alexkueck committed on
Commit
46e3881
·
verified ·
1 Parent(s): 39251ae

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +18 -32
utils.py CHANGED
@@ -306,9 +306,10 @@ def load_word_with_metadata(file_path):
306
  def split_documents_with_id(docs, text_splitter):
307
  splits = []
308
  for doc in docs:
309
- doc_splits = text_splitter.split_text(doc.page_content)
310
  for split_content in doc_splits:
311
- split_doc = Document(content=split_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc.metadata["doc_id"])
 
312
  splits.append(split_doc)
313
  return splits
314
 
@@ -342,38 +343,26 @@ def document_loading_splitting():
342
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
343
  #docs.extend(loader.load())
344
 
345
-
346
- # Vorverarbeitung der Dokumente
347
- preprocessed_docs = []
348
- original_docs = []
349
-
350
- for doc in docs:
351
- doc_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID
352
- preprocessed_content = preprocess_text(doc.page_content)
353
- preprocessed_title = preprocess_text(doc.metadata["title"])
354
- preprocessed_metadata = {
355
- "title": preprocessed_title,
356
- "page": doc.metadata["page"],
357
- "path": doc.metadata["path"],
358
- "doc_id": doc_id # Füge die ID in die Metadaten ein
359
- }
360
- preprocessed_doc = Document(content=preprocessed_content, title=preprocessed_metadata["title"], page=preprocessed_metadata["page"], path=preprocessed_metadata["path"], doc_id=doc_id)
361
- original_doc = Document(content=doc.page_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], doc_id=doc_id)
362
- preprocessed_docs.append(preprocessed_doc)
363
- original_docs.append(original_doc)
364
- print("orgin doc....................................."+str(original_doc))
365
 
366
  ################################
367
  # Document splitting
368
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
369
  #splits = text_splitter.split_documents(preprocessed_docs)
370
 
371
- # Split sowohl für originale als auch für vorverarbeitete Dokumente
372
- original_splits = split_documents_with_id(original_docs, text_splitter)
373
- preprocessed_splits = split_documents_with_id(preprocessed_docs, text_splitter)
 
 
 
 
 
 
 
 
374
 
375
- # Mapping von vorverarbeiteten Splits zu Originalsplits anhand der IDs
376
- split_to_original_mapping = {p_split.metadata["doc_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
377
 
378
 
379
  print("Splits...........................")
@@ -381,7 +370,7 @@ def document_loading_splitting():
381
  if 'divis' in split.page_content:
382
  print("DIVIS found in chunk:", split)
383
 
384
- return preprocessed_splits, split_to_original_mapping
385
 
386
  ###########################################
387
  #Chroma DB die splits ablegen - vektorisiert...
@@ -580,8 +569,6 @@ def extract_document_info(documents):
580
  else:
581
  download_link = doc_path
582
 
583
- # Prüfe, ob doc_id existiert und weise einen Standardwert zu, falls nicht
584
- id = getattr(doc, 'doc_id', None)
585
 
586
  info = {
587
  'content': doc.page_content,
@@ -590,7 +577,6 @@ def extract_document_info(documents):
590
  'seite': doc.metadata.get("page", "Unbekannte Seite"),
591
  'pfad': doc_path,
592
  'download_link': download_link,
593
- 'id': id
594
  }
595
  extracted_info.append(info)
596
  return extracted_info
@@ -735,7 +721,7 @@ class Document:
735
  "title": title,
736
  "page": page,
737
  "path": path,
738
- "doc_id": doc_id # Füge die ID in die Metadaten ein
739
  }
740
 
741
 
 
306
  def split_documents_with_id(docs, text_splitter):
307
  splits = []
308
  for doc in docs:
309
+ doc_splits = text_splitter.split_text(f"{doc.metadata['title']} {doc.page_content}")
310
  for split_content in doc_splits:
311
+ split_id = str(uuid.uuid4()) # Erzeuge eine eindeutige ID für jeden Split
312
+ split_doc = Document(content=split_content, title=doc.metadata["title"], page=doc.metadata["page"], path=doc.metadata["path"], split_id=split_id)
313
  splits.append(split_doc)
314
  return splits
315
 
 
343
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,YOUTUBE_URL_2], PATH_WORK + YOUTUBE_DIR), OpenAIWhisperParser())
344
  #docs.extend(loader.load())
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  ################################
348
  # Document splitting
349
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)# RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
350
  #splits = text_splitter.split_documents(preprocessed_docs)
351
 
352
+ # Vorverarbeitung der Dokumente
353
+ # Split der Originaldokumente
354
+ original_splits = split_documents_with_id(docs, text_splitter)
355
+
356
+ # Vorverarbeitung der Originalsplits
357
+ preprocessed_splits = []
358
+ for split in original_splits:
359
+ preprocessed_content = preprocess_text(split.page_content)
360
+ preprocessed_split = Document(content=preprocessed_content, title=split.metadata["title"], page=split.metadata["page"], path=split.metadata["path"], split_id=split.metadata["split_id"])
361
+ preprocessed_splits.append(preprocessed_split)
362
+
363
 
364
+ # Mapping von vorverarbeiteten Splits zu Originalsplits anhand der split_ids
365
+ split_to_original_mapping = {p_split.metadata["split_id"]: o_split for p_split, o_split in zip(preprocessed_splits, original_splits)}
366
 
367
 
368
  print("Splits...........................")
 
370
  if 'divis' in split.page_content:
371
  print("DIVIS found in chunk:", split)
372
 
373
+ return preprocessed_splits, original_splits, split_to_original_mapping
374
 
375
  ###########################################
376
  #Chroma DB die splits ablegen - vektorisiert...
 
569
  else:
570
  download_link = doc_path
571
 
 
 
572
 
573
  info = {
574
  'content': doc.page_content,
 
577
  'seite': doc.metadata.get("page", "Unbekannte Seite"),
578
  'pfad': doc_path,
579
  'download_link': download_link,
 
580
  }
581
  extracted_info.append(info)
582
  return extracted_info
 
721
  "title": title,
722
  "page": page,
723
  "path": path,
724
+ "split_id": split_id # Füge die ID in die Metadaten ein
725
  }
726
 
727