import json
import openai
import pandas as pd
import numpy as np
import gradio as gr
from openai.embeddings_utils import distances_from_embeddings
from .work_flow_controller import WorkFlowController
from .gpt_processor import QuestionAnswerer
class Chatbot():
    """Gradio-facing chat controller that answers questions about uploaded
    documents via embedding similarity search.

    Lifecycle: ``upload_state`` starts at ``'waiting'``; after
    :meth:`build_knowledge_base` processes the uploaded files it becomes
    ``'done'`` and questions can be answered.
    """

    # Pages whose cosine distance to the query embedding exceeds this are
    # treated as "no relevant document found".
    DISTANCE_THRESHOLD = 0.2

    def __init__(self) -> None:
        self.history = []              # list of [user_message, bot_message] pairs
        self.upload_state = 'waiting'  # 'waiting' until files are processed, then 'done'
        self.knowledge_base = None     # pandas DataFrame: one row per page, with embeddings
        self.context = None            # page text selected for the current question
        self.context_page_num = None   # page number of the selected context
        self.context_file_name = None  # file name of the selected context

    def build_knowledge_base(self, files):
        """Run the document-processing pipeline on *files* and load the
        resulting page-level knowledge base.

        Side effects: sets ``csv_result_path``/``json_result_path``,
        populates ``self.knowledge_base`` and flips ``upload_state`` to 'done'.
        """
        work_flow_controller = WorkFlowController(files)
        self.csv_result_path = work_flow_controller.csv_result_path
        self.json_result_path = work_flow_controller.json_result_path
        with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = pd.read_csv(fp)
        # NOTE(security): eval() executes arbitrary code if the CSV is ever
        # tampered with. The file comes from our own pipeline, but a literal
        # parser (ast.literal_eval / json.loads) would be safer if the
        # embedding serialization format allows it.
        knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
        self.knowledge_base = knowledge_base
        self.upload_state = 'done'

    def clear_state(self):
        """Reset all conversation and retrieval state back to the initial
        'waiting' configuration (knowledge base paths are kept)."""
        self.context = None
        self.context_page_num = None
        self.context_file_name = None
        self.upload_state = 'waiting'
        self.history = []

    def send_system_nofification(self):
        """Return a one-entry conversation describing upload progress.

        The misspelled name ("nofification") is kept for backward
        compatibility with existing callers. Returns ``None`` (implicitly)
        for any state other than 'waiting'/'done', matching the original
        behavior.
        """
        if self.upload_state == 'waiting':
            return [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
        elif self.upload_state == 'done':
            return [['已上傳文件', '文件處理完成,請開始提問']]

    def change_md(self):
        """Return a gr.Markdown update that reveals the per-file summaries."""
        content = self.__construct_summary()
        return gr.Markdown.update(content, visible=True)

    def __construct_summary(self):
        """Build a markdown block with one summary section per processed file,
        read from the pipeline's JSON result."""
        with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = json.load(fp)
        context = ''
        for key in knowledge_base.keys():
            file_name = knowledge_base[key]['file_name']
            total_page = knowledge_base[key]['total_pages']
            summary = knowledge_base[key]['summarized_content']
            # Left-aligned so gradio's markdown renderer does not treat the
            # indented lines as a code block.
            file_context = f"""
### 文件摘要
{file_name} (共 {total_page} 頁)
{summary}
"""
            context += file_context
        return context

    def user(self, message):
        """Append the user's *message* to the history (bot slot pending) and
        return ``("", history)`` for the gradio textbox/chat outputs."""
        self.history += [[message, None]]
        return "", self.history

    def bot(self):
        """Produce the bot reply for the most recent user message and return
        the updated conversation history."""
        user_message = self.history[-1][0]
        print(f'user_message: {user_message}')

        # Guard: no documents uploaded yet.
        if self.knowledge_base is None:
            self.history = [[user_message, "請先上傳文件"]]
            return self.history

        # First question (or context cleared): retrieve the best-matching page.
        if self.context is None:
            self.__get_index_file(user_message)
            print(f'CONTEXT: {self.context}')
            if self.context is None:
                # Best match was too distant — nothing relevant to answer from.
                self.history = [[user_message, "無法找到相關文件,請重新提問"]]
                return self.history

        qa_processor = QuestionAnswerer()
        bot_message = qa_processor.answer_question(
            self.context,
            self.context_page_num,
            self.context_file_name,
            self.history,
        )
        print(f'bot_message: {bot_message}')
        self.history[-1] = [user_message, bot_message]
        return self.history

    def __get_index_file(self, user_message):
        """Embed *user_message* and select the single most similar page.

        Sets ``self.context`` / ``context_page_num`` / ``context_file_name``,
        or leaves ``self.context`` as ``None`` when the best match exceeds
        ``DISTANCE_THRESHOLD``.

        Bug fix: the previous version overwrote ``self.knowledge_base`` with
        the single best row, so every later question could only ever search
        that one page. The selection is now kept in a local variable.
        """
        user_message_embedding = openai.Embedding.create(
            input=user_message, engine='text-embedding-ada-002'
        )['data'][0]['embedding']
        self.knowledge_base['distance'] = distances_from_embeddings(
            user_message_embedding,
            self.knowledge_base['page_embedding'].values,
            distance_metric='cosine',
        )
        best = self.knowledge_base.sort_values(by='distance', ascending=True).head(1)
        if best['distance'].values[0] > self.DISTANCE_THRESHOLD:
            self.context = None
        else:
            self.context = best['page_content'].values[0]
            self.context_page_num = best['page_num'].values[0]
            self.context_file_name = best['file_name'].values[0]