ChenyuRabbitLove committed on
Commit
b95388b
·
1 Parent(s): 2e4fd32

refactor/ enable multiple usage

Browse files
Files changed (4) hide show
  1. app.py +67 -135
  2. utils/chatbot.py +122 -0
  3. utils/utils.py +21 -0
  4. utils/work_flow_controller.py +1 -3
app.py CHANGED
@@ -11,126 +11,25 @@ from openai.embeddings_utils import distances_from_embeddings
11
 
12
  from utils.gpt_processor import QuestionAnswerer
13
  from utils.work_flow_controller import WorkFlowController
14
-
15
- qa_processor = QuestionAnswerer()
16
- CSV_FILE_PATHS = ''
17
- JSON_FILE_PATHS = ''
18
- KNOWLEDGE_BASE = None
19
- CONTEXT = None
20
- CONTEXT_PAGE_NUM = None
21
- CONTEXT_FILE_NAME = None
22
-
23
- def build_knowledge_base(files):
24
- global CSV_FILE_PATHS
25
- global JSON_FILE_PATHS
26
- global KNOWLEDGE_BASE
27
-
28
- work_flow_controller = WorkFlowController(files)
29
- CSV_FILE_PATHS = work_flow_controller.csv_result_path
30
- JSON_FILE_PATHS = work_flow_controller.result_path
31
- with open(CSV_FILE_PATHS, 'r', encoding='UTF-8') as fp:
32
- knowledge_base = pd.read_csv(fp)
33
- knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
34
- KNOWLEDGE_BASE = knowledge_base
35
-
36
- def construct_summary():
37
- with open(JSON_FILE_PATHS, 'r', encoding='UTF-8') as fp:
38
- knowledge_base = json.load(fp)
39
-
40
- context = """"""
41
- for key in knowledge_base.keys():
42
- file_name = knowledge_base[key]['file_name']
43
- total_page = knowledge_base[key]['total_pages']
44
- summary = knowledge_base[key]['summarized_content']
45
- file_context = f"""
46
- ### 文件摘要
47
- {file_name} (共 {total_page} 頁)<br><br>
48
- {summary}<br><br>
49
- """
50
- context += file_context
51
- return context
52
-
53
- def change_md():
54
- content = construct_summary()
55
- return gr.Markdown.update(content, visible=True)
56
-
57
- def user(message, history):
58
- return "", history + [[message, None]]
59
-
60
- def system_notification(action):
61
- if action == 'upload':
62
- return [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
63
- else:
64
- return [['已上傳文件', '文件處理完成,請開始提問']]
65
-
66
- def get_index_file(user_message):
67
- global KNOWLEDGE_BASE
68
- global CONTEXT
69
- global CONTEXT_PAGE_NUM
70
- global CONTEXT_FILE_NAME
71
-
72
- user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
73
- KNOWLEDGE_BASE['distance'] = distances_from_embeddings(user_message_embedding, KNOWLEDGE_BASE['page_embedding'].values, distance_metric='cosine')
74
- KNOWLEDGE_BASE = KNOWLEDGE_BASE.sort_values(by='distance', ascending=True).head(1)
75
- if KNOWLEDGE_BASE['distance'].values[0] > 0.2:
76
- CONTEXT = None
77
- else:
78
-
79
- CONTEXT = KNOWLEDGE_BASE['page_content'].values[0]
80
- CONTEXT_PAGE_NUM = KNOWLEDGE_BASE['page_num'].values[0]
81
- CONTEXT_FILE_NAME = KNOWLEDGE_BASE['file_name'].values[0]
82
-
83
- def bot(history):
84
- user_message = history[-1][0]
85
- global CONTEXT
86
- print(f'user_message: {user_message}')
87
-
88
- if KNOWLEDGE_BASE is None:
89
- response = [
90
- [user_message, "請先上傳文件"],
91
- ]
92
- history = response
93
- return history
94
- elif CONTEXT is None:
95
- get_index_file(user_message)
96
- print(f'CONTEXT: {CONTEXT}')
97
- if CONTEXT is None:
98
- response = [
99
- [user_message, "無法找到相關文件,請重新提問"],
100
- ]
101
- history = response
102
- return history
103
- else:
104
- pass
105
-
106
- if CONTEXT is not None:
107
- bot_message = qa_processor.answer_question(CONTEXT, CONTEXT_PAGE_NUM, CONTEXT_FILE_NAME, history)
108
- print(f'bot_message: {bot_message}')
109
- response = [
110
- [user_message, bot_message],
111
- ]
112
- history[-1] = response[0]
113
- return history
114
 
115
- def clear_state():
116
- global KNOWLEDGE_BASE
117
- global CONTEXT
118
- global CONTEXT_PAGE_NUM
119
- global CONTEXT_FILE_NAME
120
-
121
- CONTEXT = None
122
- CONTEXT_PAGE_NUM = None
123
- CONTEXT_FILE_NAME = None
124
- KNOWLEDGE_BASE = None
125
 
126
  with gr.Blocks() as demo:
127
  history = gr.State([])
128
- upload_state = gr.State("upload")
129
- finished = gr.State("finished")
130
  user_question = gr.State("")
 
 
 
 
 
 
 
131
  with gr.Row():
132
  gr.HTML('Junyi Academy Chatbot')
133
- #status_display = gr.Markdown("Success", elem_id="status_display")
134
  with gr.Row(equal_height=True):
135
  with gr.Column(scale=5):
136
  with gr.Row():
@@ -143,53 +42,86 @@ with gr.Blocks() as demo:
143
  placeholder="Enter text",
144
  container=False,
145
  )
146
- # with gr.Column(min_width=70, scale=1):
147
- # submit_btn = gr.Button("Send")
148
  with gr.Column(min_width=70, scale=1):
149
  clear_btn = gr.Button("清除")
150
  with gr.Column(min_width=70, scale=1):
151
  submit_btn = gr.Button("傳送")
152
 
153
- response = user_input.submit(user,
154
- [user_input, chatbot],
155
- [user_input, chatbot],
156
- queue=False,
157
- ).then(bot, chatbot, chatbot)
158
- response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
159
 
160
- clear_btn.click(lambda: None, None, chatbot, queue=False)
 
 
 
 
 
 
 
 
 
161
 
162
  submit_btn.click(user,
163
  [user_input, chatbot],
164
  [user_input, chatbot],
165
  chatbot,
166
- queue=False).then(bot, chatbot, chatbot).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
 
167
 
168
- clear_btn.click(clear_state, None, None, queue=False)
169
 
170
  with gr.Row():
171
  index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
172
 
173
  with gr.Row():
174
  instruction = gr.Markdown("""
175
- ## 使用說明
176
- 1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
177
- 2. 在上方輸入欄輸入問題,系統將自動回覆
178
- 3. 可以根據下方的摘要內容來提問
179
- 4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
180
- 5. 要切換檢索的文件,請點選「清除」按鈕後再重新提問
181
  """)
182
 
183
  with gr.Row():
184
  describe = gr.Markdown('', visible=True)
185
 
186
- index_file.upload(system_notification, [upload_state], chatbot) \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  .then(lambda: gr.update(interactive=True), None, None, queue=False) \
188
- .then(build_knowledge_base, [index_file]) \
189
- .then(system_notification, [finished], chatbot) \
190
  .then(lambda: gr.update(interactive=True), None, None, queue=False) \
191
- .then(change_md, None, describe)
192
-
193
 
194
  if __name__ == "__main__":
195
  demo.launch()
 
11
 
12
  from utils.gpt_processor import QuestionAnswerer
13
  from utils.work_flow_controller import WorkFlowController
14
+ from utils.chatbot import Chatbot
15
+ from utils.utils import *
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ def create_chatbot():
18
+ bot = Chatbot()
19
+ return bot
 
 
 
 
 
 
 
20
 
21
  with gr.Blocks() as demo:
22
  history = gr.State([])
 
 
23
  user_question = gr.State("")
24
+ chatbot_utils = Chatbot()
25
+
26
+ user_chatbot = gr.State(Chatbot())
27
+
28
+ upload_state = gr.State("wating")
29
+ finished = gr.State("finished")
30
+
31
  with gr.Row():
32
  gr.HTML('Junyi Academy Chatbot')
 
33
  with gr.Row(equal_height=True):
34
  with gr.Column(scale=5):
35
  with gr.Row():
 
42
  placeholder="Enter text",
43
  container=False,
44
  )
45
+
 
46
  with gr.Column(min_width=70, scale=1):
47
  clear_btn = gr.Button("清除")
48
  with gr.Column(min_width=70, scale=1):
49
  submit_btn = gr.Button("傳送")
50
 
51
+ bot_args = dict(
52
+ fn=bot,
53
+ inputs=user_chatbot,
54
+ outputs=chatbot,
55
+ )
 
56
 
57
+ user_args = dict(
58
+ fn=user,
59
+ inputs=[user_chatbot, user_input],
60
+ outputs=[user_input, chatbot],
61
+ queue=False,
62
+ )
63
+
64
+ response = user_input.submit(**user_args).then(**bot_args)
65
+
66
+ response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
67
 
68
  submit_btn.click(user,
69
  [user_input, chatbot],
70
  [user_input, chatbot],
71
  chatbot,
72
+ queue=False).then(**bot_args).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
73
+
74
 
 
75
 
76
  with gr.Row():
77
  index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
78
 
79
  with gr.Row():
80
  instruction = gr.Markdown("""
81
+ ## 使用說明
82
+ 1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
83
+ 2. 在上方輸入欄輸入問題,系統將自動回覆
84
+ 3. 可以根據下方的摘要內容來提問
85
+ 4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
86
+ 5. 要切換檢索的文件,請點選「清除對話記錄」按鈕後再重新提問
87
  """)
88
 
89
  with gr.Row():
90
  describe = gr.Markdown('', visible=True)
91
 
92
+
93
+ clear_state_args = dict(
94
+ fn=clear_state,
95
+ inputs=user_chatbot,
96
+ outputs=None,
97
+ )
98
+
99
+ clear_btn.click(**clear_state_args)
100
+
101
+ send_system_nofification_args = dict(
102
+ fn=send_system_nofification,
103
+ inputs=user_chatbot,
104
+ outputs=chatbot,
105
+ )
106
+
107
+ bulid_knowledge_base_args = dict(
108
+ fn=build_knowledge_base,
109
+ inputs=[user_chatbot, index_file],
110
+ outputs=None,
111
+ )
112
+
113
+ change_md_args = dict(
114
+ fn=change_md,
115
+ inputs=[user_chatbot],
116
+ outputs=[describe],
117
+ )
118
+
119
+ index_file.upload(**send_system_nofification_args) \
120
  .then(lambda: gr.update(interactive=True), None, None, queue=False) \
121
+ .then(**bulid_knowledge_base_args) \
122
+ .then(**send_system_nofification_args) \
123
  .then(lambda: gr.update(interactive=True), None, None, queue=False) \
124
+ .then(**change_md_args)
 
125
 
126
  if __name__ == "__main__":
127
  demo.launch()
utils/chatbot.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import openai
4
+ import pandas as pd
5
+ import numpy as np
6
+ import gradio as gr
7
+ from openai.embeddings_utils import distances_from_embeddings
8
+
9
+ from .work_flow_controller import WorkFlowController
10
+ from .gpt_processor import QuestionAnswerer
11
+
12
class Chatbot():
    """Per-session chatbot state: knowledge base, retrieved context and history.

    One instance holds everything that used to live in module-level globals,
    so that several sessions can each work on their own uploaded documents.
    """

    # Cosine distance above which the closest page is treated as unrelated
    # to the question and no context is selected.
    MAX_CONTEXT_DISTANCE = 0.2

    def __init__(self) -> None:
        self.history = []
        self.upload_state = 'waiting'

        self.knowledge_base = None
        self.context = None
        self.context_page_num = None
        self.context_file_name = None

        # Filled in by build_knowledge_base(); None until a build has run.
        self.csv_result_path = None
        self.json_result_path = None

    def build_knowledge_base(self, files):
        """Process the uploaded files and load the resulting page table.

        Args:
            files: Uploaded files, handed unchanged to ``WorkFlowController``.

        Side effects:
            Stores the controller's CSV/JSON result paths, loads the CSV into
            ``self.knowledge_base`` and sets ``upload_state`` to ``'done'``.
        """
        work_flow_controller = WorkFlowController(files)
        self.csv_result_path = work_flow_controller.csv_result_path
        self.json_result_path = work_flow_controller.json_result_path

        with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = pd.read_csv(fp)
        # NOTE(review): eval() executes whatever text is stored in the
        # 'page_embedding' column. That is only safe while the CSV is written
        # exclusively by this application; consider ast.literal_eval.
        knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)

        self.knowledge_base = knowledge_base
        self.upload_state = 'done'

    def clear_state(self):
        """Forget the selected context and the conversation.

        The loaded knowledge base is kept, so the next question triggers a
        fresh search over all pages.
        """
        self.context = None
        self.context_page_num = None
        self.context_file_name = None
        self.upload_state = 'waiting'
        self.history = []

    def send_system_nofification(self):
        """Return the chat message pair matching the current upload state.

        Returns:
            A one-element conversation list for ``'waiting'`` or ``'done'``;
            ``None`` for any other state.
        """
        if self.upload_state == 'waiting':
            return [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
        if self.upload_state == 'done':
            return [['已上傳文件', '文件處理完成,請開始提問']]
        return None

    def change_md(self):
        """Return a Markdown update showing the per-file summaries."""
        content = self.__construct_summary()
        return gr.Markdown.update(content, visible=True)

    def __construct_summary(self):
        """Build the Markdown summary text from the JSON result file.

        Returns an empty string when no knowledge base has been built yet,
        instead of failing on a missing result path.
        """
        if self.json_result_path is None:
            return ''

        with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = json.load(fp)

        sections = []
        for file_info in knowledge_base.values():
            file_name = file_info['file_name']
            total_page = file_info['total_pages']
            summary = file_info['summarized_content']
            sections.append(
                "\n"
                "### 文件摘要\n"
                f"{file_name} (共 {total_page} 頁)<br><br>\n"
                f"{summary}<br><br>\n"
            )
        return "".join(sections)

    def user(self, message):
        """Append the user's message to the history with no answer yet.

        Returns:
            ``("", history)``: an empty string plus the updated history.
        """
        self.history.append([message, None])
        return "", self.history

    def bot(self):
        """Answer the most recent user message and return the history.

        If no knowledge base is loaded, or no page is close enough to the
        question, the history is replaced by a single exchange carrying the
        corresponding notice. Otherwise the last history entry is completed
        with the answer from ``QuestionAnswerer``.
        """
        if not self.history:
            # Nothing has been asked yet, so there is nothing to answer.
            return self.history

        user_message = self.history[-1][0]
        print(f'user_message: {user_message}')

        if self.knowledge_base is None:
            self.history = [[user_message, "請先上傳文件"]]
            return self.history

        if self.context is None:
            # Context is chosen once and reused until clear_state() resets it.
            self.__get_index_file(user_message)
            print(f'CONTEXT: {self.context}')
            if self.context is None:
                self.history = [[user_message, "無法找到相關文件,請重新提問"]]
                return self.history

        qa_processor = QuestionAnswerer()
        bot_message = qa_processor.answer_question(
            self.context,
            self.context_page_num,
            self.context_file_name,
            self.history
        )
        print(f'bot_message: {bot_message}')
        self.history[-1] = [user_message, bot_message]
        return self.history

    def get_index_file(self, user_message):
        """Public entry point for the context lookup.

        The lookup itself is a double-underscore method and therefore
        name-mangled; this alias makes it reachable from outside the class
        under the plain name ``get_index_file``.
        """
        return self.__get_index_file(user_message)

    def __get_index_file(self, user_message):
        """Embed the question and pick the closest page as context."""
        user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
        distances = distances_from_embeddings(
            user_message_embedding,
            self.knowledge_base['page_embedding'].values,
            distance_metric='cosine',
        )
        self._select_context(distances)

    def _select_context(self, distances):
        """Set the context fields from per-page distances.

        Args:
            distances: One distance per knowledge-base row, in row order.

        The full knowledge base is kept; only a local view of the closest
        row is used, so later lookups still search every page.
        """
        self.knowledge_base['distance'] = distances
        closest = self.knowledge_base.sort_values(by='distance', ascending=True).head(1)

        if closest.empty or closest['distance'].values[0] > self.MAX_CONTEXT_DISTANCE:
            self.context = None
            return

        self.context = closest['page_content'].values[0]
        self.context_page_num = closest['page_num'].values[0]
        self.context_file_name = closest['file_name'].values[0]
utils/utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Module-level adapters: each takes the chatbot object as its first argument
# and forwards the remaining arguments to the method of the same name.


def clear_state(chatbot, *args):
    """Forward to ``chatbot.clear_state`` and return its result."""
    return chatbot.clear_state(*args)


def send_system_nofification(chatbot, *args):
    """Forward to ``chatbot.send_system_nofification`` and return its result."""
    return chatbot.send_system_nofification(*args)


def build_knowledge_base(chatbot, *args):
    """Forward to ``chatbot.build_knowledge_base`` and return its result."""
    return chatbot.build_knowledge_base(*args)


def change_md(chatbot, *args):
    """Forward to ``chatbot.change_md`` and return its result."""
    return chatbot.change_md(*args)


def get_index_file(chatbot, *args):
    """Forward to the chatbot's index lookup and return its result.

    A public ``get_index_file`` attribute is used when the object has one.
    Otherwise the lookup falls back to the name-mangled form of a
    double-underscore ``__get_index_file`` method defined on the object's
    own class, which a plain attribute access cannot reach.

    Raises:
        AttributeError: If the object provides neither form.
    """
    lookup = getattr(chatbot, 'get_index_file', None)
    if lookup is None:
        # Python stores `__name` methods as `_ClassName__name`, with leading
        # underscores of the class name stripped.
        mangled = f"_{type(chatbot).__name__.lstrip('_')}__get_index_file"
        lookup = getattr(chatbot, mangled)
    return lookup(*args)


def user(chatbot, *args):
    """Forward to ``chatbot.user`` and return its result."""
    return chatbot.user(*args)


def bot(chatbot, *args):
    """Forward to ``chatbot.bot`` and return its result."""
    return chatbot.bot(*args)
utils/work_flow_controller.py CHANGED
@@ -84,14 +84,12 @@ class WorkFlowController():
84
  file = self.__translate_to_chinese(file)
85
  file = self.__get_embedding(file)
86
  file = self.__get_summary(file)
87
- # file = self.__get_keywords(file)
88
- # file = self.__get_topics(file)
89
  return file
90
 
91
  def __dump_to_json(self):
92
  with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
93
  print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
94
- self.result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
95
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
96
 
97
  def __construct_knowledge_base_dataframe(self):
 
84
  file = self.__translate_to_chinese(file)
85
  file = self.__get_embedding(file)
86
  file = self.__get_summary(file)
 
 
87
  return file
88
 
89
  def __dump_to_json(self):
90
  with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
91
  print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
92
+ self.json_result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
93
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
94
 
95
  def __construct_knowledge_base_dataframe(self):