Update utils.py
Browse files
utils.py
CHANGED
@@ -24,7 +24,7 @@ import uuid
|
|
24 |
|
25 |
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, GPTNeoForCausalLM, GPT2Tokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
|
26 |
from sentence_transformers import SentenceTransformer, util
|
27 |
-
from huggingface_hub import HfApi
|
28 |
from typing import List, Dict
|
29 |
|
30 |
import gradio as gr
|
@@ -265,6 +265,22 @@ def clean_text(text):
|
|
265 |
# Directory Loader Konfigurieren um Text zu extrahieren
|
266 |
##################################################
|
267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
# Besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
269 |
def create_directory_loader(file_type, directory_path):
|
270 |
loaders = {
|
@@ -272,7 +288,9 @@ def create_directory_loader(file_type, directory_path):
|
|
272 |
'.word': load_word_with_metadata,
|
273 |
}
|
274 |
|
|
|
275 |
class CustomLoader:
|
|
|
276 |
def __init__(self, directory_path, file_type, loader_func):
|
277 |
self.directory_path = directory_path
|
278 |
self.file_type = file_type
|
@@ -288,7 +306,23 @@ def create_directory_loader(file_type, directory_path):
|
|
288 |
return documents
|
289 |
|
290 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
################################################
|
294 |
# Custom-Loader-Funktionen für den DirectoryLoader
|
|
|
24 |
|
25 |
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, GPTNeoForCausalLM, GPT2Tokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
|
26 |
from sentence_transformers import SentenceTransformer, util
|
27 |
+
from huggingface_hub import HfApi, hf_hub_download
|
28 |
from typing import List, Dict
|
29 |
|
30 |
import gradio as gr
|
|
|
265 |
# Directory Loader Konfigurieren um Text zu extrahieren
|
266 |
##################################################
|
267 |
|
268 |
+
def access_pdf(self, filename):
    """Download one file from the Hugging Face data repo and return its local path.

    The previous version created an empty ``NamedTemporaryFile`` and returned
    *its* path, while ``hf_hub_download`` stored the real file elsewhere
    (``<local_dir>/<path inside the repo>``). Callers therefore received an
    empty file and one stray temp file leaked per call. The path reported by
    ``hf_hub_download`` is now returned instead.

    Args:
        filename: Name of the file, relative to ``self.directory_path``
            inside the repo.

    Returns:
        The local filesystem path of the downloaded file, as returned by
        ``hf_hub_download``.
    """
    # Same download root as before (the system temp directory), but without
    # creating a placeholder file that nothing ever writes to or deletes.
    download_root = tempfile.gettempdir()

    # Fetch the file from the Hugging Face repo; the call returns the
    # location of the downloaded file on disk.
    return hf_hub_download(
        repo_id=DATA_REPO_ID,
        filename=os.path.join(self.directory_path, filename),
        repo_type=DATA_REPO_TYPE,
        local_dir=download_root,
        local_dir_use_symlinks=False,
    )
|
283 |
+
|
284 |
# Besseren Directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
285 |
def create_directory_loader(file_type, directory_path):
|
286 |
loaders = {
|
|
|
288 |
'.word': load_word_with_metadata,
|
289 |
}
|
290 |
|
291 |
+
|
292 |
class CustomLoader:
|
293 |
+
"""
|
294 |
def __init__(self, directory_path, file_type, loader_func):
|
295 |
self.directory_path = directory_path
|
296 |
self.file_type = file_type
|
|
|
306 |
return documents
|
307 |
|
308 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
309 |
+
"""
|
310 |
+
def __init__(self, directory_path, file_type, loader_func):
|
311 |
+
self.directory_path = directory_path
|
312 |
+
self.file_type = file_type
|
313 |
+
self.loader_func = loader_func
|
314 |
+
|
315 |
+
def load(self):
    """Collect the documents of every matching file under ``directory_path``.

    Files are listed via ``self.list_files_in_hf_space``; each name ending
    in ``self.file_type`` is resolved to a local path with
    ``self.access_pdf`` and handed to ``self.loader_func``, whose results
    are concatenated in listing order.

    Returns:
        A list with all items produced by ``loader_func``.
    """
    # Assumption carried over from the original: directory_path is a path
    # inside the Hugging Face Space.
    matching_names = (
        name
        for name in self.list_files_in_hf_space(self.directory_path)
        if name.endswith(self.file_type)
    )

    collected = []
    for name in matching_names:
        local_path = self.access_pdf(name)
        collected.extend(self.loader_func(local_path))
    return collected
|
324 |
+
|
325 |
+
return CustomLoader(directory_path, file_type, loaders[file_type])
|
326 |
|
327 |
################################################
|
328 |
# Custom-Loader-Funktionen für den DirectoryLoader
|