alexkueck committed (verified)
Commit: d353227
Parent(s): 4eb5270

Update utils.py

Files changed (1): utils.py (+5, -5)
utils.py CHANGED
@@ -255,7 +255,7 @@ def load_pdf_with_metadata(file_path):
             "page": page_num + 1,
             "path": file_path
         }
-        documents.append({"content": content, "metadata": metadata})
+        documents.append({"page_content": content, "metadata": metadata})
     return documents
 
 def load_word_with_metadata(file_path):
@@ -267,7 +267,7 @@ def load_word_with_metadata(file_path):
     contents = []
     for para in document.paragraphs:
         content = para.text
-        contents.append({"content": content, "metadata": {**metadata, "page": 1}})
+        contents.append({"page_content": content, "metadata": {**metadata, "page": 1}})
     return contents
 
 
@@ -400,7 +400,7 @@ def rag_chain(llm, prompt, retriever):
     #RAG_CHAIN_PROMPT = PromptTemplate(template="Context: {context}\n\nQuestion: {question}\n\nAnswer:")
 
     # Inahlte Abrufen der relevanten Dokumente
-    doc_contents = [doc["content"] for doc in extracted_docs]
+    doc_contents = [doc["page_content"] for doc in extracted_docs]
 
     #Berechne die Ähnlichkeiten und finde das relevanteste Dokument
     question_embedding = embedder_modell.encode(prompt, convert_to_tensor=True)
@@ -412,7 +412,7 @@ def rag_chain(llm, prompt, retriever):
     most_relevant_docs = [extracted_docs[i] for i in most_relevant_doc_indices]
 
     #Kombiniere die Inhalte aller relevanten Dokumente
-    combined_content = " ".join([doc["content"] for doc in most_relevant_docs])
+    combined_content = " ".join([doc["page_content"] for doc in most_relevant_docs])
 
     #Formuliere die Eingabe für das Generierungsmodell
     input_text = f"frage: {prompt} kontext: {combined_content}"
@@ -444,7 +444,7 @@ def extract_document_info(documents):
     extracted_info = []
     for doc in documents:
         info = {
-            'content' : doc["content"],
+            'content' : doc["page_content"],
            'metadaten' : doc["metadata"],
            'titel' : metadaten.get("title", "Keine Überschrift"),
            'seite' : metadaten.get("page", "Unbekannte Seite"),
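
For context, here is a minimal sketch (an illustration, not the repository's exact code) of how the renamed "page_content" key is consumed downstream: the loaders now return dicts shaped like {"page_content": ..., "metadata": {...}}, and rag_chain ranks them against the question by embedding similarity before joining the top hits into the generation input. The model name, the cos_sim/top-k selection, and the example documents below are assumptions for illustration.

# Hedged sketch: ranking document dicts by embedding similarity and
# building the generation input from the renamed "page_content" key.
# Example model and data are assumptions, not taken from the repository.
from sentence_transformers import SentenceTransformer, util

embedder_modell = SentenceTransformer("all-MiniLM-L6-v2")  # example model

extracted_docs = [
    {"page_content": "Erster Abschnitt ...", "metadata": {"page": 1, "path": "doc.pdf"}},
    {"page_content": "Zweiter Abschnitt ...", "metadata": {"page": 2, "path": "doc.pdf"}},
]
prompt = "Worum geht es in dem Dokument?"

# Embed every document chunk and the question
doc_contents = [doc["page_content"] for doc in extracted_docs]
doc_embeddings = embedder_modell.encode(doc_contents, convert_to_tensor=True)
question_embedding = embedder_modell.encode(prompt, convert_to_tensor=True)

# Cosine similarity, then pick the most relevant chunks
similarities = util.cos_sim(question_embedding, doc_embeddings)[0]
most_relevant_doc_indices = similarities.argsort(descending=True)[:2]
most_relevant_docs = [extracted_docs[int(i)] for i in most_relevant_doc_indices]

# Combine the selected contents into the input for the generator model
combined_content = " ".join(doc["page_content"] for doc in most_relevant_docs)
input_text = f"frage: {prompt} kontext: {combined_content}"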