Update utils.py
Browse files
utils.py
CHANGED
@@ -60,7 +60,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
60 |
from chromadb.errors import InvalidDimensionException
|
61 |
import fitz # PyMuPDF
|
62 |
import docx
|
63 |
-
from huggingface_hub import hf_hub_download
|
64 |
#import io
|
65 |
#from PIL import Image, ImageDraw, ImageOps, ImageFont
|
66 |
#import base64
|
@@ -314,12 +314,23 @@ def create_directory_loader(file_type, directory_path):
|
|
314 |
|
315 |
def load(self):
|
316 |
documents = []
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
return documents
|
324 |
|
325 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
@@ -401,6 +412,7 @@ def document_loading_splitting():
|
|
401 |
#os.makedirs(download_dir, exist_ok=True)
|
402 |
|
403 |
# Dateien im Hugging Face Space auflisten
|
|
|
404 |
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
|
405 |
print("hier.....................................")
|
406 |
# Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
|
@@ -413,6 +425,15 @@ def document_loading_splitting():
|
|
413 |
download_file_from_hf(file_name, local_file_path)
|
414 |
print("file_name..................."+str(file_name))
|
415 |
print("local_file_path..................."+str(local_file_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
417 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
418 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
|
|
60 |
from chromadb.errors import InvalidDimensionException
|
61 |
import fitz # PyMuPDF
|
62 |
import docx
|
63 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
64 |
#import io
|
65 |
#from PIL import Image, ImageDraw, ImageOps, ImageFont
|
66 |
#import base64
|
|
|
314 |
|
315 |
def load(self):
    """Download every file in ``self.file_list`` and load it into documents.

    Each entry of ``self.file_list`` is treated as a path inside the
    Hugging Face Space ``STORAGE_REPO_ID``. The file is downloaded into a
    throw-away directory, handed to ``self.loader_func`` and the resulting
    documents are collected.

    Returns:
        list: All documents produced by ``self.loader_func``, in the order
        of ``self.file_list``. Empty when ``self.file_list`` is empty.
    """
    documents = []
    for file_path in self.file_list:
        # A dedicated temporary directory is removed on exit from the
        # ``with`` block, even if the download or the loader raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            # hf_hub_download stores the file below ``local_dir`` under its
            # repository path and returns the actual local path. That
            # returned path is what must be loaded; a separately created
            # temporary file would stay empty.
            local_path = hf_hub_download(
                repo_id=STORAGE_REPO_ID,
                filename=file_path,
                repo_type="space",
                local_dir=temp_dir,
                local_dir_use_symlinks=False,
            )

            # Consume the loader result while the file still exists.
            documents.extend(self.loader_func(local_path))
    return documents
|
335 |
|
336 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
|
|
412 |
#os.makedirs(download_dir, exist_ok=True)
|
413 |
|
414 |
# Dateien im Hugging Face Space auflisten
|
415 |
+
"""
|
416 |
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
|
417 |
print("hier.....................................")
|
418 |
# Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
|
|
|
425 |
download_file_from_hf(file_name, local_file_path)
|
426 |
print("file_name..................."+str(file_name))
|
427 |
print("local_file_path..................."+str(local_file_path))
|
428 |
+
"""
|
429 |
+
|
430 |
+
|
431 |
+
# Dateien im Hugging Face Space auflisten
|
432 |
+
files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space")
|
433 |
+
pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
|
434 |
+
word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
|
435 |
+
|
436 |
+
|
437 |
# Erstellen von DirectoryLoader für jeden Dateityp
|
438 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
439 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|