alexkueck committed on
Commit
b48a2c8
·
verified ·
1 Parent(s): 1b849d5

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -7
utils.py CHANGED
@@ -60,7 +60,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
60
  from chromadb.errors import InvalidDimensionException
61
  import fitz # PyMuPDF
62
  import docx
63
- from huggingface_hub import hf_hub_download
64
  #import io
65
  #from PIL import Image, ImageDraw, ImageOps, ImageFont
66
  #import base64
@@ -314,12 +314,23 @@ def create_directory_loader(file_type, directory_path):
314
 
315
  def load(self):
316
  documents = []
317
- # Annahme: directory_path ist jetzt ein Pfad innerhalb des Hugging Face Spaces
318
- files = self.list_files_in_hf_space(self.directory_path)
319
- for file in files:
320
- if file.endswith(self.file_type):
321
- file_path = self.access_pdf(file)
322
- documents.extend(self.loader_func(file_path))
 
 
 
 
 
 
 
 
 
 
 
323
  return documents
324
 
325
  return CustomLoader(directory_path, file_type, loaders[file_type])
@@ -401,6 +412,7 @@ def document_loading_splitting():
401
  #os.makedirs(download_dir, exist_ok=True)
402
 
403
  # Dateien im Hugging Face Space auflisten
 
404
  files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
405
  print("hier.....................................")
406
  # Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
@@ -413,6 +425,15 @@ def document_loading_splitting():
413
  download_file_from_hf(file_name, local_file_path)
414
  print("file_name..................."+str(file_name))
415
  print("local_file_path..................."+str(local_file_path))
 
 
 
 
 
 
 
 
 
416
  # Erstellen von DirectoryLoader für jeden Dateityp
417
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
418
  word_loader = create_directory_loader('.word', CHROMA_WORD)
 
60
  from chromadb.errors import InvalidDimensionException
61
  import fitz # PyMuPDF
62
  import docx
63
+ from huggingface_hub import hf_hub_download, list_repo_files
64
  #import io
65
  #from PIL import Image, ImageDraw, ImageOps, ImageFont
66
  #import base64
 
314
 
315
  def load(self):
316
  documents = []
317
+ for file_path in self.file_list:
318
+ with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
319
+ temp_path = temp_file.name
320
+
321
+ # Datei aus dem Hugging Face Space herunterladen
322
+ hf_hub_download(
323
+ repo_id=STORAGE_REPO_ID,
324
+ filename=file_path,
325
+ repo_type="space",
326
+ local_dir=os.path.dirname(temp_path),
327
+ local_dir_use_symlinks=False
328
+ )
329
+
330
+ documents.extend(self.loader_func(temp_path))
331
+
332
+ # Temporäre Datei löschen
333
+ os.unlink(temp_path)
334
  return documents
335
 
336
  return CustomLoader(directory_path, file_type, loaders[file_type])
 
412
  #os.makedirs(download_dir, exist_ok=True)
413
 
414
  # Dateien im Hugging Face Space auflisten
415
+ """
416
  files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
417
  print("hier.....................................")
418
  # Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
 
425
  download_file_from_hf(file_name, local_file_path)
426
  print("file_name..................."+str(file_name))
427
  print("local_file_path..................."+str(local_file_path))
428
+ """
429
+
430
+
431
+ # Dateien im Hugging Face Space auflisten
432
+ files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space")
433
+ pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
434
+ word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
435
+
436
+
437
  # Erstellen von DirectoryLoader für jeden Dateityp
438
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
439
  word_loader = create_directory_loader('.word', CHROMA_WORD)