import ast
import json

import numpy as np
import openai
import pandas as pd
import gradio as gr
from openai.embeddings_utils import distances_from_embeddings

from .work_flow_controller import WorkFlowController
from .gpt_processor import QuestionAnswerer


class Chatbot:
    """Conversational question answering over user-uploaded documents.

    Uploaded files are processed by WorkFlowController into a CSV/JSON
    knowledge base of pages with precomputed embeddings.  Each question is
    routed to the single most similar page (cosine distance between
    text-embedding-ada-002 embeddings) and answered by QuestionAnswerer.
    """

    # Pages whose cosine distance to the question exceeds this threshold
    # are treated as "no relevant document found".
    MAX_RELEVANT_DISTANCE = 0.2

    def __init__(self) -> None:
        self.history = []              # list of [user_message, bot_message] turns
        self.upload_state = 'waiting'  # 'waiting' until documents finish processing
        self.knowledge_base = None     # DataFrame: one row per page, with embeddings
        self.context = None            # page text selected for the current question
        self.context_page_num = None
        self.context_file_name = None

    def build_knowledge_base(self, files):
        """Process *files* into a knowledge base and load the page table.

        Side effects: sets ``csv_result_path``/``json_result_path``,
        populates ``self.knowledge_base`` and flips ``upload_state`` to 'done'.
        """
        work_flow_controller = WorkFlowController(files)
        self.csv_result_path = work_flow_controller.csv_result_path
        self.json_result_path = work_flow_controller.json_result_path
        with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = pd.read_csv(fp)
        # The embedding column is serialized as a Python list literal.
        # Parse it with ast.literal_eval (safe for literals) instead of
        # eval, which would execute arbitrary code from the CSV.
        knowledge_base['page_embedding'] = (
            knowledge_base['page_embedding']
            .apply(ast.literal_eval)
            .apply(np.array)
        )
        self.knowledge_base = knowledge_base
        self.upload_state = 'done'

    def clear_state(self):
        """Reset the conversation and selection state.

        NOTE(review): ``self.knowledge_base`` is deliberately left in place
        here (matching the original) — confirm whether a full reset should
        also drop it, since ``upload_state`` returns to 'waiting'.
        """
        self.context = None
        self.context_page_num = None
        self.context_file_name = None
        self.upload_state = 'waiting'
        self.history = []

    def send_system_nofification(self):
        """Return a one-entry conversation reflecting the upload status.

        Returns None for any other ``upload_state`` value (original behavior).
        NOTE: the misspelled name ("nofification") is kept for caller
        compatibility.
        """
        if self.upload_state == 'waiting':
            return [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
        elif self.upload_state == 'done':
            return [['已上傳文件', '文件處理完成,請開始提問']]

    def change_md(self):
        """Reveal the Markdown component populated with the file summaries."""
        content = self.__construct_summary()
        return gr.Markdown.update(content, visible=True)

    def __construct_summary(self):
        """Build a Markdown digest (name, page count, summary) for every file."""
        with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
            knowledge_base = json.load(fp)

        # Collect per-file sections and join once instead of repeated
        # string concatenation.
        sections = []
        for entry in knowledge_base.values():
            file_name = entry['file_name']
            total_page = entry['total_pages']
            summary = entry['summarized_content']
            sections.append(f""" ### 文件摘要 {file_name} (共 {total_page} 頁)

{summary}

""")
        return "".join(sections)

    def user(self, message):
        """Append *message* as a new turn (reply pending); clear the textbox."""
        self.history += [[message, None]]
        return "", self.history

    def bot(self):
        """Produce the bot reply for the most recent user message.

        Returns the updated history.  All paths now fill in the current
        turn in place — the original wiped the entire history on the two
        error paths while preserving it on the success path.
        """
        user_message = self.history[-1][0]
        print(f'user_message: {user_message}')

        if self.knowledge_base is None:
            self.history[-1] = [user_message, "請先上傳文件"]
            return self.history

        if self.context is None:
            self.__get_index_file(user_message)
            print(f'CONTEXT: {self.context}')
            if self.context is None:
                self.history[-1] = [user_message, "無法找到相關文件,請重新提問"]
                return self.history

        qa_processor = QuestionAnswerer()
        bot_message = qa_processor.answer_question(
            self.context,
            self.context_page_num,
            self.context_file_name,
            self.history,
        )
        print(f'bot_message: {bot_message}')
        self.history[-1] = [user_message, bot_message]
        return self.history

    def __get_index_file(self, user_message):
        """Select the page most relevant to *user_message*.

        Sets ``self.context`` / ``context_page_num`` / ``context_file_name``,
        or leaves ``self.context`` as None when no page is close enough.

        BUG FIX: the original replaced ``self.knowledge_base`` with its
        single best row (``head(1)``), so every subsequent question could
        only ever match the first question's page.  The full table is now
        preserved across calls.
        """
        user_message_embedding = openai.Embedding.create(
            input=user_message, engine='text-embedding-ada-002'
        )['data'][0]['embedding']

        self.knowledge_base['distance'] = distances_from_embeddings(
            user_message_embedding,
            self.knowledge_base['page_embedding'].values,
            distance_metric='cosine',
        )
        best = self.knowledge_base.nsmallest(1, 'distance').iloc[0]
        if best['distance'] > self.MAX_RELEVANT_DISTANCE:
            self.context = None
        else:
            self.context = best['page_content']
            self.context_page_num = best['page_num']
            self.context_file_name = best['file_name']