File size: 3,267 Bytes
c6be0a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ff9981
97dc9e6
13eedc1
 
 
4739026
13eedc1
 
 
 
 
 
f92ea85
13eedc1
 
 
 
 
0f96e61
c6be0a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import os
import numpy as np
from openai import OpenAI

class HadithSearch:
    """Semantic search over a hadith collection using OpenAI embeddings.

    Typical use: construct with an API key, call ``load_data_from_json`` to
    load the collection, then call ``search_hadiths`` with a free-text
    description of a situation.

    The loaded table must provide an ``embeddings`` column (each cell either
    an embedding vector or a dict holding the vector under the ``"embeding"``
    key) plus ``book``, ``chapter`` and ``content`` columns, which are the
    fields read when results are formatted.
    """

    # Chat model that rewrites the user's situation into search topics.
    _CHAT_MODEL = "gpt-3.5-turbo"

    # Column holding one embedding per hadith.
    _EMBEDDING_COLUMN = "embeddings"

    # Key under which a raw cell may wrap its vector (spelling matches the data).
    _WRAPPED_EMBEDDING_KEY = "embeding"

    # System prompt sent verbatim to the chat model; the text is runtime
    # behaviour and must not be reworded.
    _SYSTEM_PROMPT = (
        "Your task is to transform a described situation into a list of the top 3 most important things to look for in a database of Islamic hadith that could be helpful to bring answers. \n\n"
        "It should be very specific and formatted with only the list and remove all occurences of the word 'Hadiths', just the topics sought. JSON FORMAT!\n\n"
        "The goal is to use this list to perform cosine similarity embedding search on the hadith database. You provide this list in French"
    )

    def __init__(self, api_key):
        """Create the OpenAI client; no data is loaded yet.

        Args:
            api_key: API key forwarded to ``OpenAI``.
        """
        self.client = OpenAI(api_key=api_key)
        self.data = None

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity of vectors *a* and *b*.

        Returns 0.0 when either vector has zero norm, instead of dividing by
        zero and producing NaN.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def _get_embedding(self, text, model="text-embedding-ada-002"):
        """Embed the search topics the chat model derives from *text*.

        The text is first sent to the chat model with ``_SYSTEM_PROMPT``; the
        chat reply (not the original text) is then embedded.

        Args:
            text: Free-text description of the user's situation.
            model: Embedding model name.

        Returns:
            The embedding vector of the chat model's reply.
        """
        # Newlines are flattened only for real strings; any other value is
        # forwarded untouched, as before, rather than hidden behind a
        # swallowed exception.
        if isinstance(text, str):
            text = text.replace("\n", " ")

        completion = self.client.chat.completions.create(
            model=self._CHAT_MODEL,
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": text},
            ],
            temperature=1,
            max_tokens=684,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        topics = completion.choices[0].message.content
        embedded = self.client.embeddings.create(input=f"{topics}", model=model)
        return embedded.data[0].embedding

    def load_data_from_json(self, json_file):
        """Load the hadith table from *json_file* into ``self.data``.

        Args:
            json_file: Anything accepted by ``pandas.read_json``.
        """
        self.data = pd.read_json(json_file)

    def _unwrap_embeddings(self):
        """Replace dict-wrapped embedding cells with the bare vector.

        Safe to call repeatedly: cells that are already bare vectors are left
        as they are, so later searches do not depend on a swallowed error.

        Raises:
            ValueError: If the loaded data has no ``embeddings`` column.
        """
        column = self._EMBEDDING_COLUMN
        if column not in self.data.columns:
            raise ValueError(f"Loaded data has no '{column}' column.")

        key = self._WRAPPED_EMBEDDING_KEY
        self.data[column] = self.data[column].apply(
            lambda cell: cell[key] if isinstance(cell, dict) and key in cell else cell
        )

    def _top_matches(self, embedding, num_hadiths):
        """Score every row against *embedding* and return the best records.

        Stores the scores in a ``similarity`` column on ``self.data`` and
        returns the ``num_hadiths`` highest-scoring rows as a list of dicts.
        """
        self.data["similarity"] = self.data[self._EMBEDDING_COLUMN].apply(
            lambda vector: self._cosine_similarity(vector, embedding)
        )
        ranked = self.data.sort_values("similarity", ascending=False)
        return ranked.head(int(num_hadiths)).to_dict(orient="records")

    def search_hadiths(self, user_input, num_hadiths=10):
        """Return the hadiths most relevant to *user_input*, formatted as text.

        Args:
            user_input: Free-text description of the user's situation.
            num_hadiths: Number of results to return (coerced with ``int``).

        Returns:
            A Markdown-style string with one section per matching hadith.

        Raises:
            ValueError: If no data has been loaded, or the loaded data has no
                ``embeddings`` column.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Use load_data_from_json method to load data.")

        # Validate and normalise the data before spending an API call.
        self._unwrap_embeddings()

        embedding = self._get_embedding(user_input, model='text-embedding-ada-002')
        results = self._top_matches(embedding, num_hadiths)
        return self._format_results(results)

    def _format_results(self, results):
        """Render result records as the user-facing text.

        Args:
            results: Iterable of dicts with ``similarity``, ``book``,
                ``chapter`` and ``content`` keys.

        Returns:
            The concatenated sections, numbered from 1; an empty string when
            there are no results.
        """
        sections = []
        for idx, result in enumerate(results, start=1):
            similarity_percentage = str(round(result["similarity"] * 100, 2)) + "%"
            book = result["book"]
            chapter = result["chapter"]
            content = result["content"]
            sections.append(
                f"## Hadith numéro {idx}: Pertinence par rapport à votre situation : {similarity_percentage}\n"
                f"## Livre : {book}\n"
                f"## Chapitre : {chapter}\n{content}\n\n------\n\n"
            )
        return "".join(sections)