File size: 2,731 Bytes
b14adfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from llama_index.retrievers.bm25 import BM25Retriever 
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Document

class HybridRetriever:
    def __init__(self, bm25_retriever: BM25Retriever, vector_retriever: VectorIndexRetriever):
        """
        Inıtializes a Hybrid Retriever with BM25Retriever and VectorIndexRetriever.

        Args:
            bm25_retriever (BM25Retriever): An instance of BM25Retriever for keyword-based retrieval.
            vector_retriever (VectorIndexRetriever): An instance of VectorIndexRetriever for vector-based retrieval.
        """
        
        self.bm25_retriever = bm25_retriever
        self.vector_retriever = vector_retriever
        self.top_k = vector_retriever._similarity_top_k + bm25_retriever._similarity_top_k

    def retrieve(self, query: str):
        """
        Retrieves documents relevant to the query using both BM25 and vector retrieval methods.
        
        Args:
            query (str): The query string for which relevant documents are to be retrieved.
        
        Returns:
            list: A list of tuples, each containing the document text and its combined score.
        """
        query = "[INST] " + " [/INST]"
        # Perform keyword search using BM25 retriever
        bm25_results = self.bm25_retriever.retrieve(query)
        
        # Perform vector search using VectorIndexRetriever
        vector_results = self.vector_retriever.retrieve(query)
        
        # Combine results, filter duplicates, and calculate combined scores
        combined_results = {}
        for result in bm25_results:
            combined_results[result.node.text] = {'score': result.score}
        
        for result in vector_results:
            if result.node.text in combined_results:
                combined_results[result.node.text]['score'] += result.score
            else:
                combined_results[result.node.text] = {'score': result.score}

        # Convert combined results to a list of tuples and sort by score
        combined_results_list = sorted(combined_results.items(), key=lambda item: item[1]['score'], reverse=True)
        return combined_results_list # {score, document}
        
    def best_docs(self, query: str):
        """
        Retrieves the most relevant documents to the query as Document objects with their scores.

        Args:
            query (str): The query string for which the most relevant documents are to be retrieved.

        Returns:
            list: A list of tuples, each containing a Document object and its score.
        """
        top_results = self.retrieve(query)
        return [(Document(text=text), score) for text, score in top_results]