Spaces:
Sleeping
Sleeping
File size: 4,208 Bytes
d8d14f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
from typing import Optional
from pathlib import Path
from loguru import logger
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
class LlamaIndexDB:
"""A class to manage document indexing and querying using LlamaIndex.
This class provides functionality to add documents from a directory and query the indexed documents.
Args:
data_dir (str): Directory containing documents to index. Defaults to "docs".
**kwargs: Additional arguments passed to SimpleDirectoryReader and VectorStoreIndex.
SimpleDirectoryReader kwargs:
- filename_as_id (bool): Use filenames as document IDs
- recursive (bool): Recursively read subdirectories
- required_exts (List[str]): Only read files with these extensions
- exclude_hidden (bool): Skip hidden files
VectorStoreIndex kwargs:
- service_context: Custom service context
- embed_model: Custom embedding model
- similarity_top_k (int): Number of similar docs to retrieve
- store_nodes_override (bool): Override node storage
"""
def __init__(self, data_dir: str = "docs", **kwargs) -> None:
"""Initialize the LlamaIndexDB with an empty index.
Args:
data_dir (str): Directory containing documents to index
**kwargs: Additional arguments for SimpleDirectoryReader and VectorStoreIndex
"""
self.data_dir = data_dir
self.index: Optional[VectorStoreIndex] = None
self.reader_kwargs = {
k: v
for k, v in kwargs.items()
if k
in SimpleDirectoryReader.__init__.__code__.co_varnames
}
self.index_kwargs = {
k: v
for k, v in kwargs.items()
if k not in self.reader_kwargs
}
logger.info("Initialized LlamaIndexDB")
data_path = Path(self.data_dir)
if not data_path.exists():
logger.error(f"Directory not found: {self.data_dir}")
raise FileNotFoundError(
f"Directory {self.data_dir} does not exist"
)
try:
documents = SimpleDirectoryReader(
self.data_dir, **self.reader_kwargs
).load_data()
self.index = VectorStoreIndex.from_documents(
documents, **self.index_kwargs
)
logger.success(
f"Successfully indexed documents from {self.data_dir}"
)
except Exception as e:
logger.error(f"Error indexing documents: {str(e)}")
raise
def query(self, query: str, **kwargs) -> str:
"""Query the indexed documents.
Args:
query (str): The query string to search for
**kwargs: Additional arguments passed to the query engine
- similarity_top_k (int): Number of similar documents to retrieve
- streaming (bool): Enable streaming response
- response_mode (str): Response synthesis mode
- max_tokens (int): Maximum tokens in response
Returns:
str: The response from the query engine
Raises:
ValueError: If no documents have been indexed yet
"""
if self.index is None:
logger.error("No documents have been indexed yet")
raise ValueError("Must add documents before querying")
try:
query_engine = self.index.as_query_engine(**kwargs)
response = query_engine.query(query)
print(response)
logger.info(f"Successfully queried: {query}")
return str(response)
except Exception as e:
logger.error(f"Error during query: {str(e)}")
raise
# # Example usage
# llama_index_db = LlamaIndexDB(
# data_dir="docs",
# filename_as_id=True,
# recursive=True,
# required_exts=[".txt", ".pdf", ".docx"],
# similarity_top_k=3
# )
# response = llama_index_db.query(
# "What is the medical history of patient 1?",
# streaming=True,
# response_mode="compact"
# )
# print(response)
|