import inspect
from pathlib import Path
from typing import Optional

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from loguru import logger


class LlamaIndexDB:
    """A class to manage document indexing and querying using LlamaIndex.

    Documents are loaded from data_dir and indexed at construction time; the
    resulting index can then be queried with query().

    Args:
        data_dir (str): Directory containing documents to index. Defaults to "docs".
        **kwargs: Additional arguments passed to SimpleDirectoryReader and
            VectorStoreIndex.

            SimpleDirectoryReader kwargs:
                - filename_as_id (bool): Use filenames as document IDs
                - recursive (bool): Recursively read subdirectories
                - required_exts (List[str]): Only read files with these extensions
                - exclude_hidden (bool): Skip hidden files

            VectorStoreIndex kwargs:
                - service_context: Custom service context
                - embed_model: Custom embedding model
                - store_nodes_override (bool): Override node storage

            Note: similarity_top_k applies at query time, so pass it to
            query() rather than to the constructor.
    """

    def __init__(self, data_dir: str = "docs", **kwargs) -> None:
        """Initialize the LlamaIndexDB and build the index.

        Args:
            data_dir (str): Directory containing documents to index
            **kwargs: Additional arguments for SimpleDirectoryReader and VectorStoreIndex

        Raises:
            FileNotFoundError: If data_dir does not exist
        """
        self.data_dir = data_dir
        self.index: Optional[VectorStoreIndex] = None

        # Split kwargs between the reader and the index: anything named in
        # SimpleDirectoryReader's signature goes to the reader, the rest to
        # VectorStoreIndex.
        reader_params = inspect.signature(SimpleDirectoryReader.__init__).parameters
        self.reader_kwargs = {
            k: v for k, v in kwargs.items() if k in reader_params
        }
        self.index_kwargs = {
            k: v for k, v in kwargs.items() if k not in self.reader_kwargs
        }
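        # For example, recursive=True would land in reader_kwargs while
        # embed_model=... would land in index_kwargs (illustrative values only).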
        logger.info("Initialized LlamaIndexDB")

        data_path = Path(self.data_dir)
        if not data_path.exists():
            logger.error(f"Directory not found: {self.data_dir}")
            raise FileNotFoundError(f"Directory {self.data_dir} does not exist")

        try:
            documents = SimpleDirectoryReader(
                self.data_dir, **self.reader_kwargs
            ).load_data()
            self.index = VectorStoreIndex.from_documents(
                documents, **self.index_kwargs
            )
            logger.success(f"Successfully indexed documents from {self.data_dir}")
        except Exception as e:
            logger.error(f"Error indexing documents: {str(e)}")
            raise

    def query(self, query: str, **kwargs) -> str:
        """Query the indexed documents.

        Args:
            query (str): The query string to search for
            **kwargs: Additional arguments passed to the query engine
                - similarity_top_k (int): Number of similar documents to retrieve
                - streaming (bool): Enable streaming response
                - response_mode (str): Response synthesis mode
                - max_tokens (int): Maximum tokens in response

        Returns:
            str: The response from the query engine

        Raises:
            ValueError: If no documents have been indexed yet
        """
        if self.index is None:
            logger.error("No documents have been indexed yet")
            raise ValueError("Must add documents before querying")

        try:
            query_engine = self.index.as_query_engine(**kwargs)
            response = query_engine.query(query)
            logger.info(f"Successfully queried: {query}")
            return str(response)
        except Exception as e:
            logger.error(f"Error during query: {str(e)}")
            raise


# Example usage:
# llama_index_db = LlamaIndexDB(
#     data_dir="docs",
#     filename_as_id=True,
#     recursive=True,
#     required_exts=[".txt", ".pdf", ".docx"],
# )
# response = llama_index_db.query(
#     "What is the medical history of patient 1?",
#     similarity_top_k=3,
#     response_mode="compact",
# )
# print(response)
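
# A minimal sketch of guarding against a missing document directory (the
# constructor raises FileNotFoundError before any indexing happens). The
# "my_docs" path here is hypothetical, used only for illustration:
#
# try:
#     db = LlamaIndexDB(data_dir="my_docs", recursive=True)
#     print(db.query("Summarize the indexed documents."))
# except FileNotFoundError:
#     logger.warning("Document directory not found; skipping indexing")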