alexkueck commited on
Commit
3c7a9bc
·
verified ·
1 Parent(s): f057509

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +36 -2
utils.py CHANGED
@@ -24,7 +24,7 @@ import uuid
24
 
25
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, GPTNeoForCausalLM, GPT2Tokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
26
  from sentence_transformers import SentenceTransformer, util
27
- from huggingface_hub import HfApi
28
  from typing import List, Dict
29
 
30
  import gradio as gr
@@ -265,6 +265,22 @@ def clean_text(text):
265
  # Directory Loader Konfigurieren um Text zu extrahieren
266
  ##################################################
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  #besseren directory Loader als CustomLoader definieren, der den inhalt des dokuemnts, die seitenzahlen, die überschriften und die pfadezu den dokumenten extrahieren
269
  def create_directory_loader(file_type, directory_path):
270
  loaders = {
@@ -272,7 +288,9 @@ def create_directory_loader(file_type, directory_path):
272
  '.word': load_word_with_metadata,
273
  }
274
 
 
275
  class CustomLoader:
 
276
  def __init__(self, directory_path, file_type, loader_func):
277
  self.directory_path = directory_path
278
  self.file_type = file_type
@@ -288,7 +306,23 @@ def create_directory_loader(file_type, directory_path):
288
  return documents
289
 
290
  return CustomLoader(directory_path, file_type, loaders[file_type])
291
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  ################################################
294
  # Custom Loader-Funktionen zu dem DirektoryLoader
 
24
 
25
  from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, GPTNeoForCausalLM, GPT2Tokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
26
  from sentence_transformers import SentenceTransformer, util
27
+ from huggingface_hub import HfApi, hf_hub_download
28
  from typing import List, Dict
29
 
30
  import gradio as gr
 
265
  # Directory Loader Konfigurieren um Text zu extrahieren
266
  ##################################################
267
 
268
+ def access_pdf(self, filename):
269
+ # Temporäre Datei erstellen
270
+ with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
271
+ temp_path = temp_file.name
272
+
273
+ # Datei aus dem Hugging Face Space herunterladen
274
+ hf_hub_download(
275
+ repo_id=DATA_REPO_ID,
276
+ filename=os.path.join(self.directory_path, filename),
277
+ repo_type=DATA_REPO_TYPE,
278
+ local_dir=os.path.dirname(temp_path),
279
+ local_dir_use_symlinks=False
280
+ )
281
+
282
+ return temp_path
283
+
284
  #besseren directory Loader als CustomLoader definieren, der den inhalt des dokuemnts, die seitenzahlen, die überschriften und die pfadezu den dokumenten extrahieren
285
  def create_directory_loader(file_type, directory_path):
286
  loaders = {
 
288
  '.word': load_word_with_metadata,
289
  }
290
 
291
+
292
  class CustomLoader:
293
+ """
294
  def __init__(self, directory_path, file_type, loader_func):
295
  self.directory_path = directory_path
296
  self.file_type = file_type
 
306
  return documents
307
 
308
  return CustomLoader(directory_path, file_type, loaders[file_type])
309
+ """
310
+ def __init__(self, directory_path, file_type, loader_func):
311
+ self.directory_path = directory_path
312
+ self.file_type = file_type
313
+ self.loader_func = loader_func
314
+
315
+ def load(self):
316
+ documents = []
317
+ # Annahme: directory_path ist jetzt ein Pfad innerhalb des Hugging Face Spaces
318
+ files = self.list_files_in_hf_space(self.directory_path)
319
+ for file in files:
320
+ if file.endswith(self.file_type):
321
+ file_path = self.access_pdf(file)
322
+ documents.extend(self.loader_func(file_path))
323
+ return documents
324
+
325
+ return CustomLoader(directory_path, file_type, loaders[file_type])
326
 
327
  ################################################
328
  # Custom Loader-Funktionen zu dem DirektoryLoader