File size: 4,208 Bytes
d8d14f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from typing import Optional
from pathlib import Path
from loguru import logger
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


class LlamaIndexDB:
    """A class to manage document indexing and querying using LlamaIndex.

    This class provides functionality to add documents from a directory and query the indexed documents.

    Args:
        data_dir (str): Directory containing documents to index. Defaults to "docs".
        **kwargs: Additional arguments passed to SimpleDirectoryReader and VectorStoreIndex.
            SimpleDirectoryReader kwargs:
                - filename_as_id (bool): Use filenames as document IDs
                - recursive (bool): Recursively read subdirectories
                - required_exts (List[str]): Only read files with these extensions
                - exclude_hidden (bool): Skip hidden files

            VectorStoreIndex kwargs:
                - service_context: Custom service context
                - embed_model: Custom embedding model
                - similarity_top_k (int): Number of similar docs to retrieve
                - store_nodes_override (bool): Override node storage
    """

    def __init__(self, data_dir: str = "docs", **kwargs) -> None:
        """Initialize the LlamaIndexDB with an empty index.

        Args:
            data_dir (str): Directory containing documents to index
            **kwargs: Additional arguments for SimpleDirectoryReader and VectorStoreIndex
        """
        self.data_dir = data_dir
        self.index: Optional[VectorStoreIndex] = None
        self.reader_kwargs = {
            k: v
            for k, v in kwargs.items()
            if k
            in SimpleDirectoryReader.__init__.__code__.co_varnames
        }
        self.index_kwargs = {
            k: v
            for k, v in kwargs.items()
            if k not in self.reader_kwargs
        }

        logger.info("Initialized LlamaIndexDB")
        data_path = Path(self.data_dir)
        if not data_path.exists():
            logger.error(f"Directory not found: {self.data_dir}")
            raise FileNotFoundError(
                f"Directory {self.data_dir} does not exist"
            )

        try:
            documents = SimpleDirectoryReader(
                self.data_dir, **self.reader_kwargs
            ).load_data()
            self.index = VectorStoreIndex.from_documents(
                documents, **self.index_kwargs
            )
            logger.success(
                f"Successfully indexed documents from {self.data_dir}"
            )
        except Exception as e:
            logger.error(f"Error indexing documents: {str(e)}")
            raise

    def query(self, query: str, **kwargs) -> str:
        """Query the indexed documents.

        Args:
            query (str): The query string to search for
            **kwargs: Additional arguments passed to the query engine
                - similarity_top_k (int): Number of similar documents to retrieve
                - streaming (bool): Enable streaming response
                - response_mode (str): Response synthesis mode
                - max_tokens (int): Maximum tokens in response

        Returns:
            str: The response from the query engine

        Raises:
            ValueError: If no documents have been indexed yet
        """
        if self.index is None:
            logger.error("No documents have been indexed yet")
            raise ValueError("Must add documents before querying")

        try:
            query_engine = self.index.as_query_engine(**kwargs)
            response = query_engine.query(query)
            print(response)
            logger.info(f"Successfully queried: {query}")
            return str(response)
        except Exception as e:
            logger.error(f"Error during query: {str(e)}")
            raise


# # Example usage
# llama_index_db = LlamaIndexDB(
#     data_dir="docs",
#     filename_as_id=True,
#     recursive=True,
#     required_exts=[".txt", ".pdf", ".docx"],
#     similarity_top_k=3
# )
# response = llama_index_db.query(
#     "What is the medical history of patient 1?",
#     streaming=True,
#     response_mode="compact"
# )
# print(response)