alexkueck commited on
Commit
3d7818b
·
verified ·
1 Parent(s): c8c1eaa

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +45 -2
utils.py CHANGED
@@ -87,6 +87,7 @@ german_stopwords = set(stopwords.words('german'))
87
  ANZAHL_DOCS = 5
88
  # Konstanten für Datei-Upload
89
  REPO_ID = "alexkueck/kkg_suche"
 
90
  REPO_TYPE = "space"
91
 
92
  ###############################
@@ -330,15 +331,57 @@ def split_documents_with_id(docs, text_splitter):
330
  splits.append(split_doc)
331
  return splits
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  ########################################
334
  #finally die Splits erzeugen und laden.....
335
  def document_loading_splitting():
336
  docs = []
337
  print("Directory Loader neu............................")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  # kreiere einen DirectoryLoader für jeden file type
339
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
340
  word_loader = create_directory_loader('.word', CHROMA_WORD)
341
-
342
  # Load the files
343
  pdf_documents = pdf_loader.load()
344
  word_documents = word_loader.load()
@@ -477,7 +520,7 @@ def upload_file_to_huggingface(file_path, upload_path):
477
  api.upload_file(
478
  path_or_fileobj=file_path,
479
  path_in_repo=upload_path,
480
- repo_id=REPO_ID,
481
  repo_type=REPO_TYPE,
482
  token=HF_WRITE
483
  )
 
87
  ANZAHL_DOCS = 5
88
  # Konstanten für Datei-Upload
89
  REPO_ID = "alexkueck/kkg_suche"
90
+ STORAGE_REPO_ID = "alexkueck/kkg_files"
91
  REPO_TYPE = "space"
92
 
93
  ###############################
 
331
  splits.append(split_doc)
332
  return splits
333
 
334
+ #######################################
335
+ # Dokumente aus anderem Space laden
336
+ #######################################
337
+ #ein File aus dem Space mit der REPO_ID laden
338
def download_file_from_hf(file_name, save_path):
    """Download one file from the storage repo on the Hugging Face Hub.

    Args:
        file_name: Path of the file inside the repo identified by
            STORAGE_REPO_ID (may contain sub-directories).
        save_path: Local path the downloaded bytes are written to.

    Returns:
        ``save_path``, unchanged, so the call can be chained.

    Raises:
        requests.HTTPError: If the Hub answers with an error status code.
        requests.Timeout: If the Hub does not respond in time.
    """
    from urllib.parse import quote

    # Files are uploaded to STORAGE_REPO_ID with repo_type=REPO_TYPE. On the
    # Hub, non-model repos are addressed with a type prefix in the URL;
    # without it the request is resolved against a model repo of that name.
    type_prefix = {"space": "spaces/", "dataset": "datasets/"}.get(REPO_TYPE, "")

    # Percent-encode the file name (keeping "/" for sub-directories) so names
    # containing spaces, umlauts or "#" still form a valid URL.
    url = (
        f"https://huggingface.co/{type_prefix}{STORAGE_REPO_ID}"
        f"/resolve/main/{quote(file_name)}"
    )

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()  # Raise an error for bad status codes

    # Only create the local file once the download has succeeded.
    with open(save_path, 'wb') as file:
        file.write(response.content)
    return save_path
345
+
346
+ #Liste aller Files in dem Space mit der Repo_id
347
def list_files_in_hf_repo(repo_id):
    """Return the paths of all files stored in a Hugging Face repo.

    Args:
        repo_id: Repo identifier in the form ``"owner/name"``.

    Returns:
        The file listing returned by ``api.list_repo_files`` for that repo.
    """
    # The repo type has to be passed explicitly: when it is omitted the Hub
    # API treats the id as a model repo, whereas this module uploads its
    # files with repo_type=REPO_TYPE.
    return api.list_repo_files(repo_id=repo_id, repo_type=REPO_TYPE)
350
+
351
+
352
+
353
  ########################################
354
  #finally die Splits erzeugen und laden.....
355
  def document_loading_splitting():
356
  docs = []
357
  print("Directory Loader neu............................")
358
+
359
+
360
+
361
+ # Verzeichnis für heruntergeladene Dateien
362
+ download_dir = "downloaded_files"
363
+ os.makedirs(download_dir, exist_ok=True)
364
+
365
+ # Dateien im Hugging Face Space auflisten
366
+ files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID)
367
+
368
+ # Dateien aus dem Hugging Face Space herunterladen
369
+ for file_name in files_in_repo:
370
+ if file_name.endswith('.pdf') or file_name.endswith('.docx'):
371
+ local_file_path = os.path.join(download_dir, os.path.basename(file_name))
372
+ download_file_from_hf(file_name, local_file_path)
373
+
374
+ # Erstellen von DirectoryLoader für jeden Dateityp
375
+ pdf_loader = create_directory_loader('.pdf', download_dir)
376
+ word_loader = create_directory_loader('.word', download_dir)
377
+
378
+
379
+
380
+ """
381
  # kreiere einen DirectoryLoader für jeden file type
382
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
383
  word_loader = create_directory_loader('.word', CHROMA_WORD)
384
+ """
385
  # Load the files
386
  pdf_documents = pdf_loader.load()
387
  word_documents = word_loader.load()
 
520
  api.upload_file(
521
  path_or_fileobj=file_path,
522
  path_in_repo=upload_path,
523
+ repo_id=STORAGE_REPO_ID,
524
  repo_type=REPO_TYPE,
525
  token=HF_WRITE
526
  )