Update utils.py
Browse files
utils.py
CHANGED
@@ -87,6 +87,7 @@ german_stopwords = set(stopwords.words('german'))
|
|
87 |
ANZAHL_DOCS = 5
|
88 |
# Konstanten für Datei-Upload
|
89 |
REPO_ID = "alexkueck/kkg_suche"
|
|
|
90 |
REPO_TYPE = "space"
|
91 |
|
92 |
###############################
|
@@ -330,15 +331,57 @@ def split_documents_with_id(docs, text_splitter):
|
|
330 |
splits.append(split_doc)
|
331 |
return splits
|
332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
########################################
|
334 |
#finally die Splits erzeugen und laden.....
|
335 |
def document_loading_splitting():
|
336 |
docs = []
|
337 |
print("Directory Loader neu............................")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
# kreiere einen DirectoryLoader für jeden file type
|
339 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
340 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
341 |
-
|
342 |
# Load the files
|
343 |
pdf_documents = pdf_loader.load()
|
344 |
word_documents = word_loader.load()
|
@@ -477,7 +520,7 @@ def upload_file_to_huggingface(file_path, upload_path):
|
|
477 |
api.upload_file(
|
478 |
path_or_fileobj=file_path,
|
479 |
path_in_repo=upload_path,
|
480 |
-
repo_id=
|
481 |
repo_type=REPO_TYPE,
|
482 |
token=HF_WRITE
|
483 |
)
|
|
|
87 |
ANZAHL_DOCS = 5
|
88 |
# Konstanten für Datei-Upload
|
89 |
REPO_ID = "alexkueck/kkg_suche"
|
90 |
+
STORAGE_REPO_ID = "alexkueck/kkg_files"
|
91 |
REPO_TYPE = "space"
|
92 |
|
93 |
###############################
|
|
|
331 |
splits.append(split_doc)
|
332 |
return splits
|
333 |
|
334 |
+
#######################################
|
335 |
+
# Dokumente aus anderem Space laden
|
336 |
+
#######################################
|
337 |
+
# ein File aus dem Space mit der STORAGE_REPO_ID laden
|
338 |
+
def download_file_from_hf(file_name, save_path):
    """Download one file from the storage repo on the Hugging Face Hub.

    Args:
        file_name: Path of the file inside the repo, relative to the repo root.
        save_path: Local path the downloaded bytes are written to.

    Returns:
        ``save_path``, unchanged.

    Raises:
        requests.HTTPError: If the Hub answers with an error status code.
        requests.Timeout: If the Hub does not respond within the timeout.
    """
    from urllib.parse import quote

    # Only model repos are served from the URL root; Spaces and datasets
    # need their type prefix, otherwise the request targets a model repo.
    type_prefix = {"space": "spaces/", "dataset": "datasets/"}.get(REPO_TYPE, "")
    # Quote the file name so characters like '#', '?' or umlauts survive in the path.
    url = (
        f"https://huggingface.co/{type_prefix}{STORAGE_REPO_ID}"
        f"/resolve/main/{quote(file_name)}"
    )
    # A timeout keeps a stalled connection from blocking forever; streaming
    # avoids holding a whole (possibly large) document in memory.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()  # Raise an error for bad status codes
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    return save_path
|
345 |
+
|
346 |
+
#Liste aller Files in dem Space mit der Repo_id
|
347 |
+
def list_files_in_hf_repo(repo_id, repo_type=None):
    """List all file paths stored in a Hugging Face Hub repository.

    Args:
        repo_id: Repository identifier in the form ``"owner/name"``.
        repo_type: Hub repo type (e.g. ``"space"``). When omitted, the
            module-level ``REPO_TYPE`` is used, because the Hub API would
            otherwise look the id up as a model repo.

    Returns:
        Whatever ``api.list_repo_files`` returns for the repo (the file
        paths relative to the repo root).
    """
    if repo_type is None:
        repo_type = REPO_TYPE
    return api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
|
350 |
+
|
351 |
+
|
352 |
+
|
353 |
########################################
|
354 |
#finally die Splits erzeugen und laden.....
|
355 |
def document_loading_splitting():
|
356 |
docs = []
|
357 |
print("Directory Loader neu............................")
|
358 |
+
|
359 |
+
|
360 |
+
|
361 |
+
# Verzeichnis für heruntergeladene Dateien
|
362 |
+
download_dir = "downloaded_files"
|
363 |
+
os.makedirs(download_dir, exist_ok=True)
|
364 |
+
|
365 |
+
# Dateien im Hugging Face Space auflisten
|
366 |
+
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID)
|
367 |
+
|
368 |
+
# Dateien aus dem Hugging Face Space herunterladen
|
369 |
+
for file_name in files_in_repo:
|
370 |
+
if file_name.endswith('.pdf') or file_name.endswith('.docx'):
|
371 |
+
local_file_path = os.path.join(download_dir, os.path.basename(file_name))
|
372 |
+
download_file_from_hf(file_name, local_file_path)
|
373 |
+
|
374 |
+
# Erstellen von DirectoryLoader für jeden Dateityp
|
375 |
+
pdf_loader = create_directory_loader('.pdf', download_dir)
|
376 |
+
word_loader = create_directory_loader('.word', download_dir)
|
377 |
+
|
378 |
+
|
379 |
+
|
380 |
+
"""
|
381 |
# kreiere einen DirectoryLoader für jeden file type
|
382 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
383 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
384 |
+
"""
|
385 |
# Load the files
|
386 |
pdf_documents = pdf_loader.load()
|
387 |
word_documents = word_loader.load()
|
|
|
520 |
api.upload_file(
|
521 |
path_or_fileobj=file_path,
|
522 |
path_in_repo=upload_path,
|
523 |
+
repo_id=STORAGE_REPO_ID,
|
524 |
repo_type=REPO_TYPE,
|
525 |
token=HF_WRITE
|
526 |
)
|