Update utils.py
Browse files
utils.py
CHANGED
@@ -255,7 +255,7 @@ def load_pdf_with_metadata(file_path):
|
|
255 |
"page": page_num + 1,
|
256 |
"path": file_path
|
257 |
}
|
258 |
-
documents.append({"
|
259 |
return documents
|
260 |
|
261 |
def load_word_with_metadata(file_path):
|
@@ -267,7 +267,7 @@ def load_word_with_metadata(file_path):
|
|
267 |
contents = []
|
268 |
for para in document.paragraphs:
|
269 |
content = para.text
|
270 |
-
contents.append({"
|
271 |
return contents
|
272 |
|
273 |
|
@@ -400,7 +400,7 @@ def rag_chain(llm, prompt, retriever):
|
|
400 |
#RAG_CHAIN_PROMPT = PromptTemplate(template="Context: {context}\n\nQuestion: {question}\n\nAnswer:")
|
401 |
|
402 |
# Inhalte der relevanten Dokumente abrufen
|
403 |
-
doc_contents = [doc["
|
404 |
|
405 |
#Berechne die Ähnlichkeiten und finde das relevanteste Dokument
|
406 |
question_embedding = embedder_modell.encode(prompt, convert_to_tensor=True)
|
@@ -412,7 +412,7 @@ def rag_chain(llm, prompt, retriever):
|
|
412 |
most_relevant_docs = [extracted_docs[i] for i in most_relevant_doc_indices]
|
413 |
|
414 |
#Kombiniere die Inhalte aller relevanten Dokumente
|
415 |
-
combined_content = " ".join([doc["
|
416 |
|
417 |
#Formuliere die Eingabe für das Generierungsmodell
|
418 |
input_text = f"frage: {prompt} kontext: {combined_content}"
|
@@ -444,7 +444,7 @@ def extract_document_info(documents):
|
|
444 |
extracted_info = []
|
445 |
for doc in documents:
|
446 |
info = {
|
447 |
-
'content' : doc["
|
448 |
'metadaten' : doc["metadata"],
|
449 |
'titel' : metadaten.get("title", "Keine Überschrift"),
|
450 |
'seite' : metadaten.get("page", "Unbekannte Seite"),
|
|
|
255 |
"page": page_num + 1,
|
256 |
"path": file_path
|
257 |
}
|
258 |
+
documents.append({"page_content": content, "metadata": metadata})
|
259 |
return documents
|
260 |
|
261 |
def load_word_with_metadata(file_path):
|
|
|
267 |
contents = []
|
268 |
for para in document.paragraphs:
|
269 |
content = para.text
|
270 |
+
contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
|
271 |
return contents
|
272 |
|
273 |
|
|
|
400 |
#RAG_CHAIN_PROMPT = PromptTemplate(template="Context: {context}\n\nQuestion: {question}\n\nAnswer:")
|
401 |
|
402 |
# Inhalte der relevanten Dokumente abrufen
|
403 |
+
doc_contents = [doc["page_content"] for doc in extracted_docs]
|
404 |
|
405 |
#Berechne die Ähnlichkeiten und finde das relevanteste Dokument
|
406 |
question_embedding = embedder_modell.encode(prompt, convert_to_tensor=True)
|
|
|
412 |
most_relevant_docs = [extracted_docs[i] for i in most_relevant_doc_indices]
|
413 |
|
414 |
#Kombiniere die Inhalte aller relevanten Dokumente
|
415 |
+
combined_content = " ".join([doc["page_content"] for doc in most_relevant_docs])
|
416 |
|
417 |
#Formuliere die Eingabe für das Generierungsmodell
|
418 |
input_text = f"frage: {prompt} kontext: {combined_content}"
|
|
|
444 |
extracted_info = []
|
445 |
for doc in documents:
|
446 |
info = {
|
447 |
+
'content' : doc["page_content"],
|
448 |
'metadaten' : doc["metadata"],
|
449 |
'titel' : metadaten.get("title", "Keine Überschrift"),
|
450 |
'seite' : metadaten.get("page", "Unbekannte Seite"),
|