alexkueck committed
Commit c947a10 · verified · 1 Parent(s): 14df361

Update utils.py

Files changed (1):
  1. utils.py +77 -9
utils.py CHANGED
@@ -54,6 +54,10 @@ from langchain_core.pydantic_v1 import BaseModel, Field
  from langchain_core.runnables import RunnablePassthrough
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from chromadb.errors import InvalidDimensionException
+ import fitz  # PyMuPDF
+ import docx
+ from langchain.document_loaders import DirectoryLoader
+ from langchain.docstore.document import Document
  #import io
  #from PIL import Image, ImageDraw, ImageOps, ImageFont
  #import base64
@@ -201,7 +205,7 @@ def clean_text(text):
  ##################################################
  ##################################################
  # Function to define a directory loader for a given file type
- def create_directory_loader(file_type, directory_path):
+ def create_directory_loaderBack(file_type, directory_path):
      # various document loaders:
      loaders = {
          '.pdf': PyPDFLoader,
@@ -212,6 +216,64 @@ def create_directory_loader(file_type, directory_path):
          glob=f"**/*{file_type}",
          loader_cls=loaders[file_type],
      )
+
+ # Define a better directory loader as a CustomLoader that extracts the content
+ # of the documents, the page numbers, the headings, and the paths to the documents
+ def create_directory_loader(file_type, directory_path):
+     loaders = {
+         '.pdf': load_pdf_with_metadata,
+         '.word': load_word_with_metadata,
+     }
+
+     class CustomLoader:
+         def __init__(self, directory_path, file_type, loader_func):
+             self.directory_path = directory_path
+             self.file_type = file_type
+             self.loader_func = loader_func
+
+         def load(self):
+             documents = []
+             for root, _, files in os.walk(self.directory_path):
+                 for file in files:
+                     if file.endswith(self.file_type):
+                         file_path = os.path.join(root, file)
+                         documents.extend(self.loader_func(file_path))
+             return documents
+
+     return CustomLoader(directory_path, file_type, loaders[file_type])
+
+
+ ################################################
+ # Custom loader functions for the directory loader
+ def load_pdf_with_metadata(file_path):
+     document = fitz.open(file_path)
+     documents = []
+     for page_num in range(len(document)):
+         page = document.load_page(page_num)
+         content = page.get_text("text")
+         metadata = {
+             "title": document.metadata.get("title", "Unbekannt"),
+             "page": page_num + 1,
+             "path": file_path
+         }
+         documents.append(Document(page_content=content, metadata=metadata))
+     return documents
+
+ def load_word_with_metadata(file_path):
+     document = docx.Document(file_path)
+     metadata = {
+         "title": "Dokument",
+         "path": file_path
+     }
+     contents = []
+     for para in document.paragraphs:
+         content = para.text
+         # no page number is available here; additional logic could be added
+         contents.append(Document(page_content=content, metadata={**metadata, "page": 1}))
+     return contents
+
+
+
  ################################################
  # Split the contents to load them into the vector database as splits
  def document_loading_splitting():
@@ -252,9 +314,15 @@ def document_loading_splitting():
  ###########################################
  # Store the splits in the Chroma DB - vectorized...
  def document_storage_chroma(splits):
-     #HF embeddings--------------------------------------
-     vectorstore = Chroma.from_documents(documents = splits, embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False}), persist_directory = PATH_WORK + CHROMA_DIR)
+     # Define the embedding function
+     embedding_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
+
+     # Initialize the vectorstore and add the documents
+     vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_fn, persist_directory=CHROMA_DIR)
      retriever = vectorstore.as_retriever(search_kwargs = {"k": ANZAHL_DOCS})
+     # Persist the vectorstore to disk
+     vectorstore.persist()
+
      return vectorstore, retriever

  ############################################
@@ -377,16 +445,16 @@ def extract_document_info(documents):
      extracted_info = []
      for doc in documents:
          info = {
-             'content': doc.page_content,
-             'page': doc.metadata['page'],
-             'path': doc.metadata['source']
+             'content': doc["content"],
+             'metadaten': doc["metadata"],
+             'titel': doc["metadata"].get("title", "Keine Überschrift"),
+             'seite': doc["metadata"].get("page", "Unbekannte Seite"),
+             'pfad': doc["metadata"].get("path", "Kein Pfad verfügbar")
          }
          extracted_info.append(info)
      return extracted_info

-
-
-
+

  ###################################################
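The new create_directory_loader returns a CustomLoader that is driven through its load() method rather than through LangChain's DirectoryLoader API. A minimal usage sketch, assuming utils.py is importable (and imports os for os.walk); './docs' stands in for a real document directory:

from utils import create_directory_loader

# hypothetical directory; load() walks it recursively and returns one
# Document per PDF page, with title/page/path in the metadata
pdf_loader = create_directory_loader('.pdf', './docs')
docs = pdf_loader.load()
for d in docs[:3]:
    print(d.metadata["title"], d.metadata["page"], d.metadata["path"])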
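Because document_storage_chroma now calls vectorstore.persist(), the Chroma index can be reopened later without re-embedding anything. A sketch of that reload, assuming CHROMA_DIR matches the persist_directory used at write time and the same embedding model is supplied:

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

CHROMA_DIR = "./chroma"  # placeholder; must match the directory used in document_storage_chroma
embedding_fn = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)
vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding_fn)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
hits = retriever.get_relevant_documents("Suchanfrage")  # list of Document objects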
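extract_document_info now subscripts its inputs (doc["content"], doc["metadata"]), so it expects plain dicts rather than LangChain Document objects. A sketch of the adapter that implies; to_plain_dicts is hypothetical and not part of the commit:

from utils import extract_document_info

def to_plain_dicts(docs):
    # convert Document objects into the dict shape extract_document_info reads
    return [{"content": d.page_content, "metadata": d.metadata} for d in docs]

# 'hits' is the retriever output from the previous sketch
for entry in extract_document_info(to_plain_dicts(hits)):
    print(entry["titel"], "Seite:", entry["seite"], "Pfad:", entry["pfad"])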