Adr740 committed on
Commit
c6be0a9
·
1 Parent(s): c21749c

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. embeded_data.json +3 -0
  3. get_similar_hadiths.py +55 -0
  4. sahih.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ embeded_data.json filter=lfs diff=lfs merge=lfs -text
embeded_data.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad8208fd5ff051c60c23288fc096d1ee5449951d679b6b103edf8efd07dd7721
3
+ size 95939294
get_similar_hadiths.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import numpy as np
4
+ from openai import OpenAI
5
+
6
class HadithSearch:
    """Rank hadiths by embedding similarity to a free-text user input.

    The hadith collection is a pandas DataFrame loaded from JSON. Each row is
    read for an ``embeddings`` column (either a vector, or a dict holding the
    vector under the key ``"embeding"``) and for the ``book``, ``chapter`` and
    ``content`` fields used when formatting results.
    """

    # Context sentence prepended to every query before it is embedded.
    _QUERY_PREFIX = "Ceci est le ressenti d'un musulman et nous cherchons les hadiths qui peuvent l'aider: "
    # Column holding the stored vectors, and the (misspelled) key under which
    # a cell may wrap its vector; both must match the data file as-is.
    _EMBEDDING_COLUMN = "embeddings"
    _EMBEDDING_KEY = "embeding"

    def __init__(self, api_key):
        """Create the OpenAI client; data must be loaded separately.

        Args:
            api_key: API key forwarded to ``OpenAI``.
        """
        self.client = OpenAI(api_key=api_key)
        self.data = None

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity of vectors *a* and *b*.

        Returns ``0.0`` when either vector has zero norm, instead of
        dividing by zero and producing ``nan``.
        """
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def _get_embedding(self, text, model="text-embedding-ada-002"):
        """Embed *text* (with the fixed context prefix) via the OpenAI client.

        Args:
            text: User input; newlines are replaced by spaces before embedding.
            model: Name of the embedding model to request.

        Returns:
            The ``embedding`` attribute of the first item in the response.

        Raises:
            TypeError: If *text* is not a string.
        """
        if not isinstance(text, str):
            # Fail early with a clear message rather than swallowing the
            # error and failing later on string concatenation.
            raise TypeError(f"text must be a str, got {type(text).__name__}")
        prompt = self._QUERY_PREFIX + text.replace("\n", " ")
        response = self.client.embeddings.create(input=prompt, model=model)
        return response.data[0].embedding

    def load_data_from_json(self, json_file):
        """Load the hadith collection from *json_file* into ``self.data``."""
        self.data = pd.read_json(json_file)

    def _unwrap_embedding(self, cell):
        """Return the raw vector from *cell*, unwrapping the dict form if used."""
        if isinstance(cell, dict) and self._EMBEDDING_KEY in cell:
            return cell[self._EMBEDDING_KEY]
        return cell

    def search_hadiths(self, user_input, num_hadiths=10):
        """Return the hadiths most similar to *user_input*, formatted as text.

        Side effect: the ``embeddings`` column of ``self.data`` is normalized
        to raw vectors and a ``similarity`` column is (re)written.

        Args:
            user_input: Free text describing the user's situation.
            num_hadiths: Number of top results to return (coerced with ``int``).

        Returns:
            A single string containing the formatted results.

        Raises:
            ValueError: If no data is loaded or the ``embeddings`` column is
                missing.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Use load_data_from_json method to load data.")

        embedding_column_name = self._EMBEDDING_COLUMN
        if embedding_column_name not in self.data.columns:
            raise ValueError(
                f"Loaded data has no '{embedding_column_name}' column."
            )

        # Per-cell unwrapping is idempotent, so repeated searches are safe
        # without relying on a swallowed exception.
        self.data[embedding_column_name] = self.data[embedding_column_name].apply(
            self._unwrap_embedding
        )

        # Convert the query vector once instead of once per row.
        query_embedding = np.asarray(
            self._get_embedding(user_input, model="text-embedding-ada-002"),
            dtype=float,
        )
        self.data["similarity"] = self.data[embedding_column_name].apply(
            lambda stored: self._cosine_similarity(stored, query_embedding)
        )

        top_rows = self.data.sort_values("similarity", ascending=False).head(int(num_hadiths))
        return self._format_results(top_rows.to_dict(orient="records"))

    def _format_results(self, results):
        """Render result records as a Markdown-style string.

        Args:
            results: Iterable of dicts with ``similarity``, ``book``,
                ``chapter`` and ``content`` keys.

        Returns:
            The concatenated entries, or an empty string if *results* is empty.
        """
        entries = []
        for idx, result in enumerate(results, start=1):
            similarity_percentage = str(round(result["similarity"] * 100, 2)) + "%"
            book = result["book"]
            chapter = result["chapter"]
            content = result["content"]
            entries.append(
                f"## Hadith numéro {idx}: Pertinence par rapport à votre situation : {similarity_percentage}\n"
                f"## Livre : {book}\n"
                f"## Chapitre : {chapter}\n{content}\n\n------\n\n"
            )
        return "".join(entries)
sahih.json ADDED
The diff for this file is too large to render. See raw diff