Spaces:
Runtime error
Runtime error
makcrx
committed on
Commit
·
9647155
1
Parent(s):
3334b06
update db
Browse files
- app.py +1 -1
- faiss_qa_2023-08-09/index.faiss +3 -0
- faiss_qa_2023-08-09/index.pkl +3 -0
- test.ipynb +12 -10
- test.py +64 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ import reranking
|
|
5 |
from extract_keywords import init_keyword_extractor, extract_keywords
|
6 |
|
7 |
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
|
8 |
-
db = FAISS.load_local('
|
9 |
init_keyword_extractor()
|
10 |
|
11 |
def main(query):
|
|
|
5 |
from extract_keywords import init_keyword_extractor, extract_keywords
|
6 |
|
7 |
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
|
8 |
+
db = FAISS.load_local('faiss_qa_2023-08-09', embeddings)
|
9 |
init_keyword_extractor()
|
10 |
|
11 |
def main(query):
|
faiss_qa_2023-08-09/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7690fb7aa21b8d325e3ce1a9f8fb241dc597aa06df042bf242c522433243b93f
|
3 |
+
size 576045
|
faiss_qa_2023-08-09/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09f3705caef79861035ed026855cefb872c386e87938d73a5deb57479507364d
|
3 |
+
size 265781
|
test.ipynb
CHANGED
@@ -9,6 +9,7 @@
|
|
9 |
"import sqlite3, json\n",
|
10 |
"from contextlib import closing\n",
|
11 |
"\n",
|
|
|
12 |
"def load_questions(sqlite_filename):\n",
|
13 |
" all_questions = []\n",
|
14 |
" with closing(sqlite3.connect(sqlite_filename)) as db:\n",
|
@@ -82,30 +83,31 @@
|
|
82 |
},
|
83 |
{
|
84 |
"cell_type": "code",
|
85 |
-
"execution_count":
|
86 |
"metadata": {},
|
87 |
"outputs": [
|
88 |
{
|
89 |
"name": "stderr",
|
90 |
"output_type": "stream",
|
91 |
"text": [
|
|
|
|
|
92 |
"/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
|
93 |
" warnings.warn(\n"
|
94 |
]
|
95 |
},
|
96 |
{
|
97 |
-
"
|
98 |
-
|
99 |
-
|
100 |
-
]
|
101 |
-
|
102 |
-
|
103 |
-
"metadata": {},
|
104 |
-
"output_type": "execute_result"
|
105 |
}
|
106 |
],
|
107 |
"source": [
|
108 |
-
"extract_keywords('
|
|
|
109 |
]
|
110 |
},
|
111 |
{
|
|
|
9 |
"import sqlite3, json\n",
|
10 |
"from contextlib import closing\n",
|
11 |
"\n",
|
12 |
+
"# use test.py to update questions in db!!!\n",
|
13 |
"def load_questions(sqlite_filename):\n",
|
14 |
" all_questions = []\n",
|
15 |
" with closing(sqlite3.connect(sqlite_filename)) as db:\n",
|
|
|
83 |
},
|
84 |
{
|
85 |
"cell_type": "code",
|
86 |
+
"execution_count": 5,
|
87 |
"metadata": {},
|
88 |
"outputs": [
|
89 |
{
|
90 |
"name": "stderr",
|
91 |
"output_type": "stream",
|
92 |
"text": [
|
93 |
+
"/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
|
94 |
+
" warnings.warn(\n",
|
95 |
"/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
|
96 |
" warnings.warn(\n"
|
97 |
]
|
98 |
},
|
99 |
{
|
100 |
+
"name": "stdout",
|
101 |
+
"output_type": "stream",
|
102 |
+
"text": [
|
103 |
+
"['яндекс доставка экспресс']\n",
|
104 |
+
"[]\n"
|
105 |
+
]
|
|
|
|
|
106 |
}
|
107 |
],
|
108 |
"source": [
|
109 |
+
"print(extract_keywords('яд экспресс'))\n",
|
110 |
+
"print(extract_keywords('яндекс.доставка'))"
|
111 |
]
|
112 |
},
|
113 |
{
|
test.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sqlite3, json
|
2 |
+
from contextlib import closing
|
3 |
+
|
4 |
+
# change THIS
output_dir = 'faiss_qa_2023-08-09'
model_name = "multi-qa-MiniLM-L6-cos-v1"

# Characters that are deleted outright by the translation table below.
punctuation = '!"#\'(),:;?[]^`}{'
# Characters that are replaced by a single space.
punctuation2 = '-/&._~+*=@<>[]\\'

# Table for str.translate(): code point -> replacement code point, or None
# to delete. '[' and ']' appear in both strings; the deletion entries are
# merged last, so they win (same outcome as the three-argument
# str.maketrans form).
remove_punctuation = {
    **dict.fromkeys(map(ord, punctuation2), ord(' ')),
    **dict.fromkeys(map(ord, punctuation)),
}
|
11 |
+
|
12 |
+
def add_special_questions(questions):
    """Append the hard-coded "call a human/manager" entry to ``questions``.

    The list is modified in place and nothing is returned. The appended
    dict carries the keys ``question``, ``query``, ``answer`` and
    ``articleId`` (set to 0).
    """
    # NOTE(review): unlike queries built in load_questions, this query text
    # is not lower-cased or stripped of punctuation -- confirm that is
    # intended.
    handoff_phrase = "Позови человека/менеджера"
    handoff_entry = dict(
        question=handoff_phrase,
        query=handoff_phrase,
        answer="Переключаю на сотрудника, ожидайте",
        articleId=0,
    )
    questions.append(handoff_entry)
|
19 |
+
|
20 |
+
def load_questions(sqlite_filename):
    """Load all usable questions from the articles SQLite database.

    Selects rows of the ``articles`` table whose ``articleType`` is
    ``'article'`` and whose ``doNotUse`` flag is NULL or 0. Each row's
    ``questions`` column is parsed as a JSON list of dicts; every dict gets
    two extra keys:

    * ``query`` -- the row's section, title and the question text joined by
      single spaces, passed through ``remove_punctuation`` and lower-cased;
    * ``articleId`` -- the row's ``articleId``.

    The special entries from ``add_special_questions`` are appended last.

    Args:
        sqlite_filename: path of the SQLite database file to read.

    Returns:
        A list of question dicts.
    """
    with closing(sqlite3.connect(sqlite_filename)) as db:
        db.row_factory = sqlite3.Row
        with closing(db.cursor()) as cursor:
            # The parentheses matter: AND binds tighter than OR, so without
            # them every row with doNotUse = 0 would be returned regardless
            # of its articleType.
            results = cursor.execute(
                "SELECT id, articleId, title, category, section, questions "
                "FROM articles "
                "WHERE articleType = ? AND (doNotUse IS NULL OR doNotUse = 0)",
                ('article',)
            ).fetchall()

    all_questions = []
    for res in results:
        # A NULL section/title contributes no words instead of raising.
        prefix_words = (res['section'] or '').split() + (res['title'] or '').split()

        questions = json.loads(res['questions'])
        for q in questions:
            words = prefix_words + q['question'].split()
            q['query'] = " ".join(words).translate(remove_punctuation).lower()
            q['articleId'] = res['articleId']
        all_questions += questions

    add_special_questions(all_questions)

    return all_questions
|
41 |
+
|
42 |
+
# Script body: read the questions, embed them and write the FAISS index
# to `output_dir`.
print("Loading questions from db...")
questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")

# langchain is imported here, after the questions have been loaded.
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings

# One Document per question: the query text is the embedded content, the
# answer and article id are carried as metadata.
docs = [
    Document(
        page_content=entry['query'],
        metadata=dict(answer=entry['answer'], articleId=entry['articleId']),
    )
    for entry in questions
]

print(f"Loading embeddings model {model_name}...")
embeddings = SentenceTransformerEmbeddings(model_name=model_name)

print("embedding documents...")

db = FAISS.from_documents(docs, embeddings)
db.save_local(output_dir)

print('Saved!')
|