Update utils.py
Browse files
utils.py
CHANGED
@@ -443,6 +443,35 @@ def extract_document_info(documents):
|
|
443 |
}
|
444 |
extracted_info.append(info)
|
445 |
return extracted_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
|
447 |
|
448 |
|
|
|
443 |
}
|
444 |
extracted_info.append(info)
|
445 |
return extracted_info
|
446 |
+
|
447 |
+
|
448 |
+
|
449 |
+
def extract_document_info(documents):
    """Build one display/download record per retrieved document.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` attribute.

    Returns:
        A list of dicts, one per document, with the keys ``content``,
        ``metadata``, ``titel``, ``seite``, ``pfad`` and ``download_link``.
        An empty input yields an empty list.
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import quote

    base_url = "https://huggingface.co/spaces/alexkueck/SucheRAG/resolve/main/chroma/kkg"

    extracted_info = []
    for doc in documents:
        # ``get("path", "")`` still returns None when the key is present with
        # a None value; normalise so basename()/endswith() cannot raise.
        doc_path = doc.metadata.get("path") or ""

        # The file name (last path component) doubles as the title.
        filename = os.path.basename(doc_path)
        title = filename if filename else "Keine Überschrift"

        # Percent-encode the file name: spaces, umlauts, '#' or '?' would
        # otherwise produce a broken or truncated URL.
        url_name = quote(title)

        # NOTE(review): the query string carries the literal text "hf_token",
        # not the value of a variable — presumably a placeholder; confirm
        # before relying on these links for private files.
        if doc_path.endswith('.pdf'):
            download_link = f"{base_url}/pdf/{url_name}?token=hf_token"
        elif doc_path.endswith('.docx'):
            download_link = f"{base_url}/word/{url_name}?token=hf_token"
        else:
            # Any other type: hand back the stored path unchanged.
            download_link = doc_path

        info = {
            'content': doc.page_content,
            'metadata': doc.metadata,
            'titel': title,
            'seite': doc.metadata.get("page", "Unbekannte Seite"),
            'pfad': doc_path,
            'download_link': download_link,
        }
        extracted_info.append(info)
    return extracted_info
|
475 |
|
476 |
|
477 |
|