Update utils.py
Browse files
utils.py
CHANGED
@@ -186,8 +186,6 @@ modell_rag = DistilBertForQuestionAnswering.from_pretrained(HF_MODELL)
|
|
186 |
tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
|
187 |
qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
|
188 |
|
189 |
-
|
190 |
-
|
191 |
HF_MODELL ="EleutherAI/gpt-neo-2.7B"
|
192 |
modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
|
193 |
tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
|
@@ -280,7 +278,8 @@ def access_pdf(self, filename):
|
|
280 |
)
|
281 |
|
282 |
return temp_path
|
283 |
-
|
|
|
284 |
#besseren directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
285 |
def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
|
286 |
loaders = {
|
@@ -290,7 +289,7 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
|
|
290 |
|
291 |
|
292 |
class CustomLoader:
|
293 |
-
|
294 |
def __init__(self, directory_path, file_type, loader_func):
|
295 |
self.directory_path = directory_path
|
296 |
self.file_type = file_type
|
@@ -306,35 +305,14 @@ def create_custom_loader(file_type, file_list): #create_directory_loader(file_t
|
|
306 |
return documents
|
307 |
|
308 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
309 |
-
|
310 |
-
|
311 |
-
def __init__(self, file_type, file_list, loader_func):
|
312 |
-
self.file_type = file_type
|
313 |
-
self.file_list = file_list
|
314 |
-
self.loader_func = loader_func
|
315 |
-
|
316 |
-
def load(self):
|
317 |
-
documents = []
|
318 |
-
for file_path in self.file_list:
|
319 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
|
320 |
-
temp_path = temp_file.name
|
321 |
-
|
322 |
-
# Datei aus dem Hugging Face Space herunterladen
|
323 |
-
hf_hub_download(
|
324 |
-
repo_id=STORAGE_REPO_ID,
|
325 |
-
filename=file_path,
|
326 |
-
repo_type="space",
|
327 |
-
local_dir=os.path.dirname(temp_path),
|
328 |
-
local_dir_use_symlinks=False,
|
329 |
-
token=hf_token
|
330 |
-
)
|
331 |
-
|
332 |
-
documents.extend(self.loader_func(temp_path))
|
333 |
-
|
334 |
-
# Temporäre Datei löschen
|
335 |
-
os.unlink(temp_path)
|
336 |
-
return documents
|
337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
return CustomLoader(file_type, file_list, loaders[file_type])
|
339 |
|
340 |
################################################
|
@@ -906,6 +884,15 @@ def get_filename(file_pfad):
|
|
906 |
return result
|
907 |
|
908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
#################################################
|
910 |
#Klasse mit Zuständen - z.B. für Interrupt, wenn Stop gedrückt...
|
911 |
#################################################
|
@@ -932,14 +919,35 @@ class Document:
|
|
932 |
}
|
933 |
|
934 |
|
935 |
-
|
936 |
-
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
-
|
942 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
943 |
|
944 |
|
945 |
|
|
|
186 |
tokenizer_rag = DistilBertTokenizer.from_pretrained(HF_MODELL)
|
187 |
qa_pipeline = pipeline("question-answering", model=modell_rag, tokenizer=tokenizer_rag)
|
188 |
|
|
|
|
|
189 |
HF_MODELL ="EleutherAI/gpt-neo-2.7B"
|
190 |
modell_rag = GPTNeoForCausalLM.from_pretrained(HF_MODELL)
|
191 |
tokenizer_rag = GPT2Tokenizer.from_pretrained(HF_MODELL)
|
|
|
278 |
)
|
279 |
|
280 |
return temp_path
|
281 |
+
|
282 |
+
"""
|
283 |
#besseren directory Loader als CustomLoader definieren, der den Inhalt des Dokuments, die Seitenzahlen, die Überschriften und die Pfade zu den Dokumenten extrahiert
|
284 |
def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
|
285 |
loaders = {
|
|
|
289 |
|
290 |
|
291 |
class CustomLoader:
|
292 |
+
|
293 |
def __init__(self, directory_path, file_type, loader_func):
|
294 |
self.directory_path = directory_path
|
295 |
self.file_type = file_type
|
|
|
305 |
return documents
|
306 |
|
307 |
return CustomLoader(directory_path, file_type, loaders[file_type])
|
308 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
+
|
311 |
+
def create_custom_loader(file_type, file_list):
    """Build a ``CustomLoader`` for the given file suffix.

    Args:
        file_type: File suffix used to select the parser; ``'.pdf'`` and
            ``'.docx'`` are the supported keys.
        file_list: Files to hand to the loader; passed through unchanged.

    Returns:
        A ``CustomLoader`` bound to the parser registered for ``file_type``.

    Raises:
        KeyError: If ``file_type`` is not one of the supported suffixes.
    """
    # The table is built per call so the parser functions are resolved when
    # the factory runs, not when the module is imported.
    parser_by_suffix = {
        '.pdf': load_pdf_with_metadata,
        '.docx': load_word_with_metadata,
    }
    parser = parser_by_suffix[file_type]
    return CustomLoader(file_type, file_list, parser)
|
317 |
|
318 |
################################################
|
|
|
884 |
return result
|
885 |
|
886 |
|
887 |
+
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Report whether ``s`` ends with a stop word or with the start of one.

    A match on a proper prefix of a stop word (e.g. ``"wor"`` for
    ``"world"``) also counts, so text that might be the beginning of a stop
    word is detected as well.

    Args:
        s: The text whose tail is inspected.
        stop_words: Stop words to test against. Empty entries are ignored.

    Returns:
        True if ``s`` ends with any stop word or with any non-empty prefix of
        one, otherwise False.
    """
    for stop_word in stop_words:
        if not stop_word:
            # str.endswith("") is always True, so an empty entry would make
            # every input match; skip it.
            continue
        # All non-empty prefixes, up to and including the full stop word.
        candidates = tuple(
            stop_word[:end] for end in range(1, len(stop_word) + 1)
        )
        if s.endswith(candidates):
            return True
    return False
|
895 |
+
|
896 |
#################################################
|
897 |
#Klasse mit Zuständen - z.B. für Interrupt, wenn Stop gedrückt...
|
898 |
#################################################
|
|
|
919 |
}
|
920 |
|
921 |
|
922 |
+
##########################################
|
923 |
+
#Class für die Directory Loader - um sie anzupassen
|
924 |
+
##########################################
|
925 |
+
class CustomLoader:
    """Download a list of files from the storage Space and parse them.

    Each file is fetched with ``hf_hub_download`` from ``STORAGE_REPO_ID``
    (both, like ``hf_token``, are module-level names defined elsewhere in
    this file) and handed to ``loader_func`` as a local path.
    """

    def __init__(self, file_type, file_list, loader_func):
        """Store the loader configuration.

        Args:
            file_type: File suffix this loader is responsible for.
            file_list: Repo-relative file names to download and parse.
            loader_func: Callable taking a local file path and returning an
                iterable of documents.
        """
        self.file_type = file_type
        self.file_list = file_list
        self.loader_func = loader_func

    def load(self):
        """Download every file in ``file_list`` and collect its documents.

        Returns:
            A list with the documents produced by ``loader_func`` for all
            files, in ``file_list`` order.
        """
        documents = []
        for file_path in self.file_list:
            # hf_hub_download writes to <local_dir>/<filename> and returns
            # that path, so a pre-created temp file would never receive the
            # content. Download into a scratch directory and parse the path
            # that is actually returned. The directory, and with it the
            # download, is removed even if the loader raises.
            with tempfile.TemporaryDirectory() as scratch_dir:
                # Download the file from the Hugging Face Space.
                local_path = hf_hub_download(
                    repo_id=STORAGE_REPO_ID,
                    filename=file_path,
                    repo_type="space",
                    local_dir=scratch_dir,
                    local_dir_use_symlinks=False,
                    token=hf_token,
                )
                documents.extend(self.loader_func(local_path))
        return documents
|
949 |
+
|
950 |
+
|
951 |
|
952 |
|
953 |
|