ChenyuRabbitLove committed on
Commit
26f62c4
·
1 Parent(s): e873140

fix/ format and modify __get_index_file sequence

Browse files
app.py CHANGED
@@ -1,35 +1,14 @@
1
- import json
2
- import time
3
- import random
4
- import os
5
-
6
- import openai
7
  import gradio as gr
8
- import pandas as pd
9
- import numpy as np
10
- from openai.embeddings_utils import distances_from_embeddings
11
 
12
- from utils.gpt_processor import QuestionAnswerer
13
- from utils.work_flow_controller import WorkFlowController
14
  from utils.chatbot import Chatbot
15
  from utils.utils import *
16
-
17
- def create_chatbot():
18
- bot = Chatbot()
19
- return bot
20
 
 
21
  with gr.Blocks() as demo:
22
- history = gr.State([])
23
- user_question = gr.State("")
24
- chatbot_utils = Chatbot()
25
-
26
  user_chatbot = gr.State(Chatbot())
27
-
28
- upload_state = gr.State("wating")
29
- finished = gr.State("finished")
30
 
31
  with gr.Row():
32
- gr.HTML('Junyi Academy Chatbot')
33
  with gr.Row(equal_height=True):
34
  with gr.Column(scale=5):
35
  with gr.Row():
@@ -48,48 +27,55 @@ with gr.Blocks() as demo:
48
  with gr.Column(min_width=70, scale=1):
49
  submit_btn = gr.Button("傳送")
50
 
51
- bot_args = dict(
52
- fn=bot,
53
- inputs=user_chatbot,
54
- outputs=chatbot,
55
- )
56
-
57
- user_args = dict(
58
- fn=user,
59
- inputs=[user_chatbot, user_input],
60
- outputs=[user_input, chatbot],
61
- queue=False,
62
- )
63
-
64
- response = user_input.submit(**user_args).then(**bot_args)
65
-
66
- response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
67
-
68
- submit_btn.click(user,
69
- [user_input, chatbot],
70
- [user_input, chatbot],
71
- chatbot,
72
- queue=False).then(**bot_args).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
73
-
74
-
75
-
76
  with gr.Row():
77
- index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
78
-
 
 
79
  with gr.Row():
80
- instruction = gr.Markdown("""
 
81
  ## 使用說明
82
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
83
  2. 在上方輸入欄輸入問題,系統將自動回覆
84
  3. 可以根據下方的摘要內容來提問
85
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
86
- 5. 要切換檢索的文件,請點選「清除對話記錄」按鈕後再重新提問
87
- """)
 
88
 
89
  with gr.Row():
90
- describe = gr.Markdown('', visible=True)
 
 
91
 
92
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  clear_state_args = dict(
94
  fn=clear_state,
95
  inputs=user_chatbot,
@@ -98,6 +84,7 @@ with gr.Blocks() as demo:
98
 
99
  clear_btn.click(**clear_state_args)
100
 
 
101
  send_system_nofification_args = dict(
102
  fn=send_system_nofification,
103
  inputs=user_chatbot,
@@ -116,12 +103,13 @@ with gr.Blocks() as demo:
116
  outputs=[describe],
117
  )
118
 
119
- index_file.upload(**send_system_nofification_args) \
120
- .then(lambda: gr.update(interactive=True), None, None, queue=False) \
121
- .then(**bulid_knowledge_base_args) \
122
- .then(**send_system_nofification_args) \
123
- .then(lambda: gr.update(interactive=True), None, None, queue=False) \
124
- .then(**change_md_args)
125
-
 
126
  if __name__ == "__main__":
127
  demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
2
 
 
 
3
  from utils.chatbot import Chatbot
4
  from utils.utils import *
 
 
 
 
5
 
6
+ # start of gradio interface
7
  with gr.Blocks() as demo:
 
 
 
 
8
  user_chatbot = gr.State(Chatbot())
 
 
 
9
 
10
  with gr.Row():
11
+ gr.HTML("Junyi Academy Chatbot")
12
  with gr.Row(equal_height=True):
13
  with gr.Column(scale=5):
14
  with gr.Row():
 
27
  with gr.Column(min_width=70, scale=1):
28
  submit_btn = gr.Button("傳送")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  with gr.Row():
31
+ index_file = gr.File(
32
+ file_count="multiple", file_types=["pdf"], label="Upload PDF file"
33
+ )
34
+
35
  with gr.Row():
36
+ instruction = gr.Markdown(
37
+ """
38
  ## 使用說明
39
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
40
  2. 在上方輸入欄輸入問題,系統將自動回覆
41
  3. 可以根據下方的摘要內容來提問
42
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
43
+ 5. 要切換檢索的文件,請點選「清除」按鈕後再重新提問
44
+ """
45
+ )
46
 
47
  with gr.Row():
48
+ describe = gr.Markdown("", visible=True)
49
+
50
+ # end of gradio interface
51
 
52
+ # start of workflow controller
53
+
54
+ # defining workflow of user bot interaction
55
+ bot_args = dict(
56
+ fn=bot,
57
+ inputs=user_chatbot,
58
+ outputs=chatbot,
59
+ )
60
+
61
+ user_args = dict(
62
+ fn=user,
63
+ inputs=[user_chatbot, user_input],
64
+ outputs=[user_input, chatbot],
65
+ queue=False,
66
+ )
67
+
68
+ response = user_input.submit(**user_args).then(**bot_args)
69
+
70
+ response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
71
+
72
+ submit_btn.click(
73
+ **user_args,
74
+ ).then(
75
+ **bot_args
76
+ ).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
77
+
78
+ # defining workflow of clear state
79
  clear_state_args = dict(
80
  fn=clear_state,
81
  inputs=user_chatbot,
 
84
 
85
  clear_btn.click(**clear_state_args)
86
 
87
+ # defining workflow of building knowledge base
88
  send_system_nofification_args = dict(
89
  fn=send_system_nofification,
90
  inputs=user_chatbot,
 
103
  outputs=[describe],
104
  )
105
 
106
+ index_file.upload(**send_system_nofification_args).then(
107
+ lambda: gr.update(interactive=True), None, None, queue=False
108
+ ).then(**bulid_knowledge_base_args).then(**send_system_nofification_args).then(
109
+ lambda: gr.update(interactive=True), None, None, queue=False
110
+ ).then(
111
+ **change_md_args
112
+ )
113
+
114
  if __name__ == "__main__":
115
  demo.launch()
utils/chatbot.py CHANGED
@@ -9,57 +9,59 @@ from openai.embeddings_utils import distances_from_embeddings
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
- class Chatbot():
 
13
  def __init__(self) -> None:
14
  self.history = []
15
- self.upload_state = 'waiting'
16
-
17
  self.knowledge_base = None
18
  self.context = None
19
  self.context_page_num = None
20
  self.context_file_name = None
21
-
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
- with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
29
  knowledge_base = pd.read_csv(fp)
30
- knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
 
 
31
 
32
  self.knowledge_base = knowledge_base
33
- self.upload_state = 'done'
34
 
35
  def clear_state(self):
36
  self.context = None
37
  self.context_page_num = None
38
  self.context_file_name = None
39
- self.upload_state = 'waiting'
40
  self.history = []
41
 
42
  def send_system_nofification(self):
43
- if self.upload_state == 'waiting':
44
- conversation = [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
45
  return conversation
46
- elif self.upload_state == 'done':
47
- conversation = [['已上傳文件', '文件處理完成,請開始提問']]
48
  return conversation
49
-
50
  def change_md(self):
51
  content = self.__construct_summary()
52
  return gr.Markdown.update(content, visible=True)
53
-
54
  def __construct_summary(self):
55
- with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
56
  knowledge_base = json.load(fp)
57
 
58
  context = """"""
59
  for key in knowledge_base.keys():
60
- file_name = knowledge_base[key]['file_name']
61
- total_page = knowledge_base[key]['total_pages']
62
- summary = knowledge_base[key]['summarized_content']
63
  file_context = f"""
64
  ### 文件摘要
65
  {file_name} (共 {total_page} 頁)<br><br>
@@ -67,14 +69,14 @@ class Chatbot():
67
  """
68
  context += file_context
69
  return context
70
-
71
  def user(self, message):
72
  self.history += [[message, None]]
73
  return "", self.history
74
-
75
  def bot(self):
76
  user_message = self.history[-1][0]
77
- print(f'user_message: {user_message}')
78
 
79
  if self.knowledge_base is None:
80
  response = [
@@ -82,41 +84,47 @@ class Chatbot():
82
  ]
83
  self.history = response
84
  return self.history
85
- elif self.context is None:
 
86
  self.__get_index_file(user_message)
87
- print(f'CONTEXT: {self.context}')
88
  if self.context is None:
89
  response = [
90
  [user_message, "無法找到相關文件,請重新提問"],
91
  ]
92
  self.history = response
93
  return self.history
94
- else:
95
- pass
96
-
97
- if self.context is not None:
98
- qa_processor = QuestionAnswerer()
99
- bot_message = qa_processor.answer_question(
100
- self.context,
101
- self.context_page_num,
102
- self.context_file_name,
103
- self.history
104
- )
105
- print(f'bot_message: {bot_message}')
106
- response = [
107
- [user_message, bot_message],
108
- ]
109
- self.history[-1] = response[0]
110
- return self.history
111
-
112
  def __get_index_file(self, user_message):
113
- user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
114
- self.knowledge_base['distance'] = distances_from_embeddings(user_message_embedding, self.knowledge_base['page_embedding'].values, distance_metric='cosine')
115
- self.knowledge_base = self.knowledge_base.sort_values(by='distance', ascending=True).head(1)
 
 
 
 
 
 
 
 
 
116
 
117
- if self.knowledge_base['distance'].values[0] > 0.2:
118
  self.context = None
119
  else:
120
- self.context = self.knowledge_base['page_content'].values[0]
121
- self.context_page_num = self.knowledge_base['page_num'].values[0]
122
- self.context_file_name = self.knowledge_base['file_name'].values[0]
 
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
+
13
+ class Chatbot:
14
  def __init__(self) -> None:
15
  self.history = []
16
+ self.upload_state = "waiting"
17
+
18
  self.knowledge_base = None
19
  self.context = None
20
  self.context_page_num = None
21
  self.context_file_name = None
 
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
+ with open(self.csv_result_path, "r", encoding="UTF-8") as fp:
29
  knowledge_base = pd.read_csv(fp)
30
+ knowledge_base["page_embedding"] = (
31
+ knowledge_base["page_embedding"].apply(eval).apply(np.array)
32
+ )
33
 
34
  self.knowledge_base = knowledge_base
35
+ self.upload_state = "done"
36
 
37
  def clear_state(self):
38
  self.context = None
39
  self.context_page_num = None
40
  self.context_file_name = None
41
+ self.upload_state = "waiting"
42
  self.history = []
43
 
44
  def send_system_nofification(self):
45
+ if self.upload_state == "waiting":
46
+ conversation = [["已上傳文件", "文件處理中(摘要、翻譯等),結束後將自動回覆"]]
47
  return conversation
48
+ elif self.upload_state == "done":
49
+ conversation = [["已上傳文件", "文件處理完成,請開始提問"]]
50
  return conversation
51
+
52
  def change_md(self):
53
  content = self.__construct_summary()
54
  return gr.Markdown.update(content, visible=True)
55
+
56
  def __construct_summary(self):
57
+ with open(self.json_result_path, "r", encoding="UTF-8") as fp:
58
  knowledge_base = json.load(fp)
59
 
60
  context = """"""
61
  for key in knowledge_base.keys():
62
+ file_name = knowledge_base[key]["file_name"]
63
+ total_page = knowledge_base[key]["total_pages"]
64
+ summary = knowledge_base[key]["summarized_content"]
65
  file_context = f"""
66
  ### 文件摘要
67
  {file_name} (共 {total_page} 頁)<br><br>
 
69
  """
70
  context += file_context
71
  return context
72
+
73
  def user(self, message):
74
  self.history += [[message, None]]
75
  return "", self.history
76
+
77
  def bot(self):
78
  user_message = self.history[-1][0]
79
+ print(f"user_message: {user_message}")
80
 
81
  if self.knowledge_base is None:
82
  response = [
 
84
  ]
85
  self.history = response
86
  return self.history
87
+
88
+ else:
89
  self.__get_index_file(user_message)
 
90
  if self.context is None:
91
  response = [
92
  [user_message, "無法找到相關文件,請重新提問"],
93
  ]
94
  self.history = response
95
  return self.history
96
+ else:
97
+ qa_processor = QuestionAnswerer()
98
+ bot_message = qa_processor.answer_question(
99
+ self.context,
100
+ self.context_page_num,
101
+ self.context_file_name,
102
+ self.history,
103
+ )
104
+ print(f"bot_message: {bot_message}")
105
+ response = [
106
+ [user_message, bot_message],
107
+ ]
108
+ self.history[-1] = response[0]
109
+ return self.history
110
+
 
 
 
111
  def __get_index_file(self, user_message):
112
+ user_message_embedding = openai.Embedding.create(
113
+ input=user_message, engine="text-embedding-ada-002"
114
+ )["data"][0]["embedding"]
115
+
116
+ self.knowledge_base["distance"] = distances_from_embeddings(
117
+ user_message_embedding,
118
+ self.knowledge_base["page_embedding"].values,
119
+ distance_metric="cosine",
120
+ )
121
+ self.knowledge_base = self.knowledge_base.sort_values(
122
+ by="distance", ascending=True
123
+ )
124
 
125
+ if self.knowledge_base["distance"].values[0] > 0.2:
126
  self.context = None
127
  else:
128
+ self.context = self.knowledge_base["page_content"].values[0]
129
+ self.context_page_num = self.knowledge_base["page_num"].values[0]
130
+ self.context_file_name = self.knowledge_base["file_name"].values[0]
utils/docx_processor.py CHANGED
@@ -6,13 +6,14 @@ import docx2txt
6
 
7
  from gpt_processor import Translator
8
 
 
9
  class DOCXProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
- 'file_name': self.file_path.split('/')[-1],
14
- 'file_format': 'DOCX',
15
- 'file_full_content': '',
16
  }
17
  self.__build_info()
18
 
@@ -20,21 +21,24 @@ class DOCXProcessor:
20
  try:
21
  text = docx2txt.process(self.file_path)
22
  text = unicodedata.normalize("NFKD", text)
23
- text = text.replace('\n', ' ').replace('\r', '')
24
- text = re.sub(' +', ' ', text)
25
- self.file_info['is_chinese'] = self.__is_chinese(text)
26
 
27
  tranlator = Translator()
28
- self.file_info['file_full_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
 
 
 
 
29
 
30
-
31
  except FileNotFoundError:
32
  print(f"File not found: {self.file_path}")
33
  except Exception as e:
34
  print(f"An error occurred: {str(e)}")
35
-
36
  def __is_chinese(self, text: str) -> bool:
37
  for char in text:
38
- if char >= '\u4e00' and char <= '\u9fff':
39
  return True
40
- return False
 
6
 
7
  from gpt_processor import Translator
8
 
9
+
10
  class DOCXProcessor:
11
  def __init__(self, file_path: str) -> None:
12
  self.file_path = file_path
13
  self.file_info = {
14
+ "file_name": self.file_path.split("/")[-1],
15
+ "file_format": "DOCX",
16
+ "file_full_content": "",
17
  }
18
  self.__build_info()
19
 
 
21
  try:
22
  text = docx2txt.process(self.file_path)
23
  text = unicodedata.normalize("NFKD", text)
24
+ text = text.replace("\n", " ").replace("\r", "")
25
+ text = re.sub(" +", " ", text)
26
+ self.file_info["is_chinese"] = self.__is_chinese(text)
27
 
28
  tranlator = Translator()
29
+ self.file_info["file_full_content"] = (
30
+ tranlator.translate_to_chinese(text)
31
+ if not self.file_info["is_chinese"]
32
+ else text
33
+ )
34
 
 
35
  except FileNotFoundError:
36
  print(f"File not found: {self.file_path}")
37
  except Exception as e:
38
  print(f"An error occurred: {str(e)}")
39
+
40
  def __is_chinese(self, text: str) -> bool:
41
  for char in text:
42
+ if char >= "\u4e00" and char <= "\u9fff":
43
  return True
44
+ return False
utils/gpt_processor.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ from typing import List
5
+
6
+ from opencc import OpenCC
7
+
8
+ import openai
9
+ import tiktoken
10
+
11
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
12
+
13
+
14
class GPTAgent:
    """Base class with the OpenAI settings and text helpers shared by the agents.

    Subclasses choose the model name; this class stores the request
    parameters and provides chunking, preprocessing and result parsing.
    """

    def __init__(self, model):
        """Store *model* and the default request parameters.

        Also assigns ``openai.api_key`` from the module-level
        ``OPENAI_API_KEY`` constant.
        """
        openai.api_key = OPENAI_API_KEY
        self.model = model
        self.temperature = 0.8
        self.frequency_penalty = 0
        self.presence_penalty = 0.6
        self.max_tokens = 2048  # completion budget sent as ``max_tokens``
        self.split_max_tokens = 13000  # per-chunk budget for split_into_many

    def request(self, messages):
        """Send *messages* as a chat completion and return the reply text.

        The earlier body unconditionally called ``self.agent.complete``, but
        this class never assigns ``agent``, so the call raised
        ``AttributeError``. An ``agent`` attribute is still honoured when a
        subclass provides one; otherwise the request goes through
        ``openai.ChatCompletion.create`` with this instance's settings.
        """
        agent = getattr(self, "agent", None)
        if agent is not None:
            response = agent.complete(messages=messages)
            return response.choices[0].message["content"]

        response = openai.ChatCompletion.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            frequency_penalty=self.frequency_penalty,
            presence_penalty=self.presence_penalty,
        )
        return response["choices"][0]["message"]["content"]

    @staticmethod
    def _pack_sentences(sentences, token_counts, limit):
        """Greedily group consecutive sentences into groups within *limit* tokens.

        Returns a list of sentence lists. A sentence whose own token count
        exceeds *limit* is skipped (with a warning) because it can never fit.
        """
        groups = []
        current = []
        used = 0
        for sentence, count in zip(sentences, token_counts):
            if used + count > limit:
                # Close the running group; never emit an empty one.
                if current:
                    groups.append(current)
                current = []
                used = 0
            if count > limit:
                logging.warning(
                    "Skipping a sentence of %d tokens (limit %d)", count, limit
                )
                continue
            current.append(sentence)
            used += count + 1  # +1 accounts for the removed delimiter
        if current:
            # The original loop dropped this trailing group, losing the end
            # of every text that needed more than one chunk.
            groups.append(current)
        return groups

    def split_into_many(self, text) -> List[str]:
        """Split *text* on "。" into chunks of at most ``split_max_tokens`` tokens.

        Token counts come from the ``cl100k_base`` tiktoken encoding. When
        the whole text fits in one chunk (or nothing could be packed), the
        text is returned unchanged as a single-element list.
        """
        tokenizer = tiktoken.get_encoding("cl100k_base")
        sentences = text.split("。")
        token_counts = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

        groups = self._pack_sentences(sentences, token_counts, self.split_max_tokens)
        packed = sum(len(group) for group in groups)
        if not groups or (len(groups) == 1 and packed == len(sentences)):
            return [text]

        # Every chunk but the last lost its closing "。" to the split.
        chunks = ["。".join(group) + "。" for group in groups[:-1]]
        chunks.append("。".join(groups[-1]))
        return chunks

    def preprocess(self, text):
        """Return *text* with newlines turned into spaces and carriage returns removed."""
        text = text.replace("\n", " ").replace("\r", "")
        return text

    def parse_result(self, result):
        """Flatten a list of delimiter-separated answers into cleaned items.

        Each entry is split on "," first, then on "、", then on the
        full-width comma; a later delimiter is only tried when the previous
        one produced no split. (The third fallback used to repeat the ASCII
        comma and therefore could never match.) Every item is passed through
        the OpenCC ``s2tw`` converter, stripped, and has "。" removed. The
        input list is left unmodified.
        """
        parsed_result = []
        chinese_converter = OpenCC("s2tw")
        for entry in result:
            parts = entry.split(",")
            if len(parts) == 1:
                parts = parts[0].split("、")
            if len(parts) == 1:
                parts = parts[0].split("\uff0c")
            for word in parts:
                try:
                    parsed_result.append(
                        chinese_converter.convert(word).strip().replace("。", "")
                    )
                except Exception as e:
                    logging.error(e)
                    logging.error("Failed to parse result")
        return parsed_result
84
+
85
+
86
class Translator(GPTAgent):
    """Agent that translates text into Traditional Chinese with gpt-3.5-turbo."""

    def __init__(self):
        super().__init__("gpt-3.5-turbo")

    def translate_to_chinese(self, text):
        """Translate *text* to Traditional Chinese and return the result.

        The reply is passed through the OpenCC ``s2tw`` converter so any
        Simplified characters are turned into Traditional ones.

        If the API call fails, the error is logged and the untranslated
        *text* is returned. Previously the method went on to read an
        unassigned ``response`` and raised ``UnboundLocalError``.
        """
        # Prompt (Traditional Chinese): act as a translator/corrector, answer
        # only with the improved Traditional Chinese text, no explanations.
        system_prompt_zh_tw = """
        我希望你擔任中文翻譯、拼寫糾正及改進的角色。
        我將用英文與你交流,請將其翻譯並用繁體中文回答,同時對我的文本進行糾正和改進。
        保持原意不變,但使其更具文學性。我希望你僅回覆更正、改進的部分,不要寫解釋,也不要使用任何简体中文。
        """
        messages = [
            {"role": "system", "content": f"{system_prompt_zh_tw}"},
            {"role": "user", "content": text},
        ]
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty,
            )
        except Exception as e:
            logging.error(e)
            logging.error("Failed to translate to Chinese")
            # Best effort: keep the pipeline going with the original text.
            return text

        # translate from simplified chinese to traditional chinese
        chinese_converter = OpenCC("s2tw")
        return chinese_converter.convert(
            response["choices"][0]["message"]["content"].strip()
        )
122
+
123
+
124
class EmbeddingGenerator(GPTAgent):
    """Agent that turns text into an embedding vector."""

    def __init__(self):
        super().__init__("text-davinci-002")

    def get_embedding(self, text):
        """Return the embedding of *text* from the ``text-embedding-ada-002`` engine.

        The value returned is the ``embedding`` field of the first item in
        the response's ``data`` list.
        """
        reply = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
        first_item = reply["data"][0]
        return first_item["embedding"]
132
+
133
+
134
class KeywordsGenerator(GPTAgent):
    """Agent that asks gpt-3.5-turbo for search keywords describing a text."""

    def __init__(self):
        super().__init__("gpt-3.5-turbo")

    def extract_keywords(self, text):
        """Return the parsed keywords gathered from every chunk of *text*.

        The text is chunked with ``split_into_many``; each chunk is sent with
        a prompt asking for 5 comma-separated keywords. A chunk whose request
        fails is logged and skipped. The collected replies are flattened by
        ``parse_result``.
        """
        system_prompt = """
        請你為以下內容抓出 5 個關鍵字用以搜尋這篇文章,並用「,」來分隔
        """
        collected = []
        for chunk in self.split_into_many(text):
            system_message = {"role": "system", "content": f"{system_prompt}"}
            user_message = {"role": "user", "content": f"{self.preprocess(chunk)}"}
            try:
                reply = openai.ChatCompletion.create(
                    model=self.model,
                    messages=[system_message, user_message],
                    temperature=0,
                    max_tokens=self.max_tokens,
                    frequency_penalty=self.frequency_penalty,
                    presence_penalty=self.presence_penalty,
                )
                answer = reply["choices"][0]["message"]["content"]
                collected.append(answer.strip())
            except Exception as e:
                logging.error(e)
                logging.error("Failed to extract keywords")
        return self.parse_result(collected)
164
+
165
+
166
class TopicsGenerator(GPTAgent):
    """Agent that asks gpt-3.5-turbo for abstract topic labels for a text."""

    def __init__(self):
        super().__init__("gpt-3.5-turbo")

    def extract_topics(self, text):
        """Return the parsed topics gathered from every chunk of *text*.

        The text is chunked with ``split_into_many``; each chunk is sent with
        a prompt asking for 3 comma-separated topics. A chunk whose request
        fails is logged and skipped. The collected replies are flattened by
        ``parse_result``.
        """
        system_prompt = """
        請你為以下內容給予 3 個高度抽象的主題分類這篇文章,並用「,」來分隔
        """
        collected = []
        for chunk in self.split_into_many(text):
            system_message = {"role": "system", "content": f"{system_prompt}"}
            user_message = {"role": "user", "content": f"{self.preprocess(chunk)}"}
            try:
                reply = openai.ChatCompletion.create(
                    model=self.model,
                    messages=[system_message, user_message],
                    temperature=0,
                    max_tokens=self.max_tokens,
                    frequency_penalty=self.frequency_penalty,
                    presence_penalty=self.presence_penalty,
                )
                answer = reply["choices"][0]["message"]["content"]
                collected.append(answer.strip())
            except Exception as e:
                logging.error(e)
                logging.error("Failed to extract topics")
        return self.parse_result(collected)
196
+
197
+
198
class Summarizer(GPTAgent):
    """Agent that summarizes a document with gpt-3.5-turbo-16k."""

    def __init__(self):
        super().__init__("gpt-3.5-turbo-16k")

    def summarize(self, text):
        """Return a summary of *text* with runs of newlines replaced by ``<br>``.

        The reply is passed through the OpenCC ``s2tw`` converter before the
        newline replacement.

        If the API call fails, the error is logged and an empty string is
        returned. Previously the method went on to read an unassigned
        ``response`` and raised ``UnboundLocalError``.
        """
        system_prompt = """
        請幫我總結以下的文章。
        """
        messages = [
            {"role": "system", "content": f"{system_prompt}"},
            {"role": "user", "content": text},
        ]
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty,
            )
        except Exception as e:
            logging.error(e)
            logging.error("Failed to summarize")
            # No reply to post-process; report "no summary" to the caller.
            return ""

        summary = response["choices"][0]["message"]["content"]
        print(f"the summary is {summary.strip()}")

        chinese_converter = OpenCC("s2tw")
        converted = chinese_converter.convert(summary)

        return re.sub(r"\n+", "<br>", converted)
229
+
230
+
231
class QuestionAnswerer(GPTAgent):
    """Agent that answers questions about a document with gpt-3.5-turbo-16k."""

    def __init__(self):
        super().__init__("gpt-3.5-turbo-16k")

    def answer_chunk_question(self, text, question):
        """Answer *question* against each chunk of *text* and join the answers.

        The text is chunked with ``split_into_many``. Each chunk's answer is
        converted with OpenCC ``s2tw``; the answers are joined with "。".

        A chunk whose request fails is logged and skipped. Previously the
        code fell through and read ``response``, which was either unbound
        (first chunk) or still held the previous chunk's reply.
        """
        system_prompt = """
        你是一個知識檢索系統,我會給你一份文件,請幫我依照文件內容回答問題,並用繁體中文回答。以下是文件內容
        """
        chinese_converter = OpenCC("s2tw")
        answer_chunks = []
        for chunk in self.split_into_many(text):
            messages = [
                # The old f-string contained the literal text " + '\n' '",
                # i.e. the concatenation was written inside the string.
                {"role": "system", "content": f"{system_prompt}\n'{chunk}'"},
                {"role": "user", "content": f"{question}"},
            ]
            try:
                response = openai.ChatCompletion.create(
                    model=self.model,
                    messages=messages,
                    temperature=self.temperature,
                    max_tokens=1024,
                    frequency_penalty=self.frequency_penalty,
                    presence_penalty=self.presence_penalty,
                )
            except Exception as e:
                logging.error(e)
                logging.error("Failed to answer question")
                continue
            answer_chunks.append(
                chinese_converter.convert(
                    response["choices"][0]["message"]["content"].strip()
                )
            )

        return "。".join(answer_chunks)

    def answer_question(self, context, context_page_num, context_file_name, history):
        """Answer the conversation in *history* using *context* as the document.

        Returns the model's answer prefixed with a line naming
        *context_file_name* and *context_page_num*, converted with OpenCC
        ``s2tw``. Returns ``None`` when the request fails (the error is
        logged).
        """
        system_prompt = """
        你是一個知識檢索系統,我會給你一份文件,請幫我依照文件內容回答問題,並用繁體中文回答。以下是文件內容
        """

        history = self.__construct_message_history(history)
        messages = [
            # Same fix as above: drop the stray " + '\n' " text that was
            # being sent to the model as part of the prompt.
            {"role": "system", "content": f"{system_prompt}\n'''{context}'''"},
        ] + history
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=2048,
                frequency_penalty=self.frequency_penalty,
                presence_penalty=self.presence_penalty,
            )
            chinese_converter = OpenCC("s2tw")
            page_num_message = f"以下內容來自 {context_file_name},第 {context_page_num} 頁\n\n"
            bot_answer = response["choices"][0]["message"]["content"]
            whole_answer = page_num_message + bot_answer

            return chinese_converter.convert(whole_answer)
        except Exception as e:
            logging.error(e)
            logging.error("Failed to answer question")
            return None

    def __construct_message_history(self, history):
        """Convert ``[user, bot]`` pairs into chat messages.

        Only the last 10 pairs are kept. A pair whose bot entry is ``None``
        contributes just the user message.
        """
        print(f"history is {history}")
        max_history_length = 10
        if len(history) > max_history_length:
            history = history[-max_history_length:]

        messages = []
        for turn in history:
            messages.append({"role": "user", "content": turn[0]})
            if turn[1] is not None:
                messages.append({"role": "assistant", "content": turn[1]})

        return messages
utils/pdf_processor.py CHANGED
@@ -5,45 +5,48 @@ import logging
5
 
6
  from .gpt_processor import Translator
7
 
 
8
  class PDFProcessor:
9
  def __init__(self, file_path: str) -> None:
10
  self.file_path = file_path
11
  self.file_info = {
12
- 'file_name': self.file_path.split('/')[-1],
13
- 'file_format': 'PDF',
14
- 'total_pages': 0,
15
- 'file_content': {},
16
- 'file_full_content': '',
17
- 'is_chinese': '',
18
  }
19
  self.__build_info()
20
 
21
  def __build_info(self) -> None:
22
  try:
23
- with open(self.file_path, 'rb') as pdf_file:
24
  pdf_reader = PyPDF2.PdfReader(pdf_file)
25
  pages = len(pdf_reader.pages)
26
- self.file_info['total_pages'] = pages
27
  for i, page in enumerate(pdf_reader.pages):
28
  text = page.extract_text()
29
  text = unicodedata.normalize("NFKD", text)
30
- text = text.replace('\n', ' ').replace('\r', '')
31
- text = re.sub(' +', ' ', text)
32
- self.file_info['is_chinese'] = self.__is_chinese(text)
33
 
34
  page_info = {}
35
  logging.info(f"Processing page {i + 1}...")
36
- page_info['page_num'] = i + 1
37
- page_info['page_content'] = text
38
- self.file_info['file_content'][i + 1] = page_info
39
- self.file_info['file_full_content'] = self.file_info['file_full_content'] + page_info['page_content']
 
 
40
  except FileNotFoundError:
41
  print(f"File not found: {self.file_path}")
42
  except Exception as e:
43
  print(f"An error occurred: {str(e)}")
44
-
45
  def __is_chinese(self, text: str) -> bool:
46
  for char in text:
47
- if char >= '\u4e00' and char <= '\u9fff':
48
  return True
49
- return False
 
5
 
6
  from .gpt_processor import Translator
7
 
8
+
9
  class PDFProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
+ "file_name": self.file_path.split("/")[-1],
14
+ "file_format": "PDF",
15
+ "total_pages": 0,
16
+ "file_content": {},
17
+ "file_full_content": "",
18
+ "is_chinese": "",
19
  }
20
  self.__build_info()
21
 
22
  def __build_info(self) -> None:
23
  try:
24
+ with open(self.file_path, "rb") as pdf_file:
25
  pdf_reader = PyPDF2.PdfReader(pdf_file)
26
  pages = len(pdf_reader.pages)
27
+ self.file_info["total_pages"] = pages
28
  for i, page in enumerate(pdf_reader.pages):
29
  text = page.extract_text()
30
  text = unicodedata.normalize("NFKD", text)
31
+ text = text.replace("\n", " ").replace("\r", "")
32
+ text = re.sub(" +", " ", text)
33
+ self.file_info["is_chinese"] = self.__is_chinese(text)
34
 
35
  page_info = {}
36
  logging.info(f"Processing page {i + 1}...")
37
+ page_info["page_num"] = i + 1
38
+ page_info["page_content"] = text
39
+ self.file_info["file_content"][i + 1] = page_info
40
+ self.file_info["file_full_content"] = (
41
+ self.file_info["file_full_content"] + page_info["page_content"]
42
+ )
43
  except FileNotFoundError:
44
  print(f"File not found: {self.file_path}")
45
  except Exception as e:
46
  print(f"An error occurred: {str(e)}")
47
+
48
  def __is_chinese(self, text: str) -> bool:
49
  for char in text:
50
+ if char >= "\u4e00" and char <= "\u9fff":
51
  return True
52
+ return False
utils/utils.py CHANGED
@@ -1,21 +1,26 @@
1
-
2
  def clear_state(chatbot, *args):
3
  return chatbot.clear_state(*args)
4
 
 
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
 
8
  def build_knowledge_base(chatbot, *args):
9
  return chatbot.build_knowledge_base(*args)
10
 
 
11
  def change_md(chatbot, *args):
12
  return chatbot.change_md(*args)
13
 
 
14
  def get_index_file(chatbot, *args):
15
  return chatbot.get_index_file(*args)
16
 
 
17
  def user(chatbot, *args):
18
  return chatbot.user(*args)
19
 
 
20
  def bot(chatbot, *args):
21
- return chatbot.bot(*args)
 
 
1
  def clear_state(chatbot, *args):
2
  return chatbot.clear_state(*args)
3
 
4
+
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
8
+
9
  def build_knowledge_base(chatbot, *args):
10
  return chatbot.build_knowledge_base(*args)
11
 
12
+
13
  def change_md(chatbot, *args):
14
  return chatbot.change_md(*args)
15
 
16
+
17
  def get_index_file(chatbot, *args):
18
  return chatbot.get_index_file(*args)
19
 
20
+
21
  def user(chatbot, *args):
22
  return chatbot.user(*args)
23
 
24
+
25
  def bot(chatbot, *args):
26
+ return chatbot.bot(*args)
utils/work_flow_controller.py CHANGED
@@ -5,15 +5,21 @@ import hashlib
5
 
6
  import pandas as pd
7
 
8
- from .gpt_processor import (EmbeddingGenerator, KeywordsGenerator, Summarizer,
9
- TopicsGenerator, Translator)
 
 
 
 
 
10
  from .pdf_processor import PDFProcessor
11
 
12
  processors = {
13
- 'pdf': PDFProcessor,
14
  }
15
 
16
- class WorkFlowController():
 
17
  def __init__(self, file_src) -> None:
18
  # check if the file_path is list
19
  # self.file_paths = self.__get_file_name(file_src)
@@ -24,8 +30,8 @@ class WorkFlowController():
24
  self.files_info = {}
25
 
26
  for file_path in self.file_paths:
27
- file_name = file_path.split('/')[-1]
28
- file_format = file_path.split('.')[-1]
29
  self.file_processor = processors[file_format]
30
  file = self.file_processor(file_path).file_info
31
  file = self.__process_file(file)
@@ -34,24 +40,25 @@ class WorkFlowController():
34
  self.__dump_to_json()
35
  self.__dump_to_csv()
36
 
37
-
38
  def __get_summary(self, file: dict):
39
  # get summary from file content
40
-
41
  summarizer = Summarizer()
42
- file['summarized_content'] = summarizer.summarize(file['file_full_content'])
43
  return file
44
 
45
  def __get_keywords(self, file: dict):
46
  # get keywords from file content
47
  keywords_generator = KeywordsGenerator()
48
- file['keywords'] = keywords_generator.extract_keywords(file['file_full_content'])
 
 
49
  return file
50
 
51
  def __get_topics(self, file: dict):
52
  # get topics from file content
53
  topics_generator = TopicsGenerator()
54
- file['topics'] = topics_generator.extract_topics(file['file_full_content'])
55
  return file
56
 
57
  def __get_embedding(self, file):
@@ -59,41 +66,54 @@ class WorkFlowController():
59
  # return embedding
60
  embedding_generator = EmbeddingGenerator()
61
 
62
- for i, _ in enumerate(file['file_content']):
63
  # use i+1 to meet the index of file_content
64
- file['file_content'][i+1]['page_embedding'] = embedding_generator.get_embedding(file['file_content'][i+1]['page_content'])
 
 
 
 
65
  return file
66
-
67
 
68
  def __translate_to_chinese(self, file: dict):
69
  # translate file content to chinese
70
  translator = Translator()
71
  # reset the file full content
72
- file['file_full_content'] = ''
73
 
74
- for i, _ in enumerate(file['file_content']):
75
  # use i+1 to meet the index of file_content
76
- file['file_content'][i+1]['page_content'] = translator.translate_to_chinese(file['file_content'][i+1]['page_content'])
77
- file['file_full_content'] = file['file_full_content'] + file['file_content'][i+1]['page_content']
 
 
 
 
 
 
78
  return file
79
-
80
  def __process_file(self, file: dict):
81
  # process file content
82
  # return processed data
83
- if not file['is_chinese']:
84
  file = self.__translate_to_chinese(file)
85
  file = self.__get_embedding(file)
86
  file = self.__get_summary(file)
87
  return file
88
 
89
  def __dump_to_json(self):
90
- with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
91
- print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
92
- self.json_result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
 
 
 
 
 
93
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
94
 
95
  def __construct_knowledge_base_dataframe(self):
96
-
97
  rows = []
98
  for file_path, content in self.files_info.items():
99
  file_full_content = content["file_full_content"]
@@ -107,15 +127,24 @@ class WorkFlowController():
107
  }
108
  rows.append(row)
109
 
110
- columns = ["file_name", "page_num", "page_content", "page_embedding", "file_full_content"]
 
 
 
 
 
 
111
  df = pd.DataFrame(rows, columns=columns)
112
  return df
113
 
114
  def __dump_to_csv(self):
115
  df = self.__construct_knowledge_base_dataframe()
116
- df.to_csv(os.path.join(os.getcwd(), 'knowledge_base.csv'), index=False)
117
- print("Dumping to csv, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.csv'))
118
- self.csv_result_path = os.path.join(os.getcwd(), 'knowledge_base.csv')
 
 
 
119
 
120
  def __get_file_name(self, file_src):
121
  file_paths = [x.name for x in file_src]
@@ -127,4 +156,4 @@ class WorkFlowController():
127
  while chunk := f.read(8192):
128
  md5_hash.update(chunk)
129
 
130
- return md5_hash.hexdigest()
 
5
 
6
  import pandas as pd
7
 
8
+ from .gpt_processor import (
9
+ EmbeddingGenerator,
10
+ KeywordsGenerator,
11
+ Summarizer,
12
+ TopicsGenerator,
13
+ Translator,
14
+ )
15
  from .pdf_processor import PDFProcessor
16
 
17
  processors = {
18
+ "pdf": PDFProcessor,
19
  }
20
 
21
+
22
+ class WorkFlowController:
23
  def __init__(self, file_src) -> None:
24
  # check if the file_path is list
25
  # self.file_paths = self.__get_file_name(file_src)
 
30
  self.files_info = {}
31
 
32
  for file_path in self.file_paths:
33
+ file_name = file_path.split("/")[-1]
34
+ file_format = file_path.split(".")[-1]
35
  self.file_processor = processors[file_format]
36
  file = self.file_processor(file_path).file_info
37
  file = self.__process_file(file)
 
40
  self.__dump_to_json()
41
  self.__dump_to_csv()
42
 
 
43
  def __get_summary(self, file: dict):
44
  # get summary from file content
45
+
46
  summarizer = Summarizer()
47
+ file["summarized_content"] = summarizer.summarize(file["file_full_content"])
48
  return file
49
 
50
  def __get_keywords(self, file: dict):
51
  # get keywords from file content
52
  keywords_generator = KeywordsGenerator()
53
+ file["keywords"] = keywords_generator.extract_keywords(
54
+ file["file_full_content"]
55
+ )
56
  return file
57
 
58
  def __get_topics(self, file: dict):
59
  # get topics from file content
60
  topics_generator = TopicsGenerator()
61
+ file["topics"] = topics_generator.extract_topics(file["file_full_content"])
62
  return file
63
 
64
  def __get_embedding(self, file):
 
66
  # return embedding
67
  embedding_generator = EmbeddingGenerator()
68
 
69
+ for i, _ in enumerate(file["file_content"]):
70
  # use i+1 to meet the index of file_content
71
+ file["file_content"][i + 1][
72
+ "page_embedding"
73
+ ] = embedding_generator.get_embedding(
74
+ file["file_content"][i + 1]["page_content"]
75
+ )
76
  return file
 
77
 
78
  def __translate_to_chinese(self, file: dict):
79
  # translate file content to chinese
80
  translator = Translator()
81
  # reset the file full content
82
+ file["file_full_content"] = ""
83
 
84
+ for i, _ in enumerate(file["file_content"]):
85
  # use i+1 to meet the index of file_content
86
+ file["file_content"][i + 1][
87
+ "page_content"
88
+ ] = translator.translate_to_chinese(
89
+ file["file_content"][i + 1]["page_content"]
90
+ )
91
+ file["file_full_content"] = (
92
+ file["file_full_content"] + file["file_content"][i + 1]["page_content"]
93
+ )
94
  return file
95
+
96
  def __process_file(self, file: dict):
97
  # process file content
98
  # return processed data
99
+ if not file["is_chinese"]:
100
  file = self.__translate_to_chinese(file)
101
  file = self.__get_embedding(file)
102
  file = self.__get_summary(file)
103
  return file
104
 
105
  def __dump_to_json(self):
106
+ with open(
107
+ os.path.join(os.getcwd(), "knowledge_base.json"), "w", encoding="utf-8"
108
+ ) as f:
109
+ print(
110
+ "Dumping to json, the path is: "
111
+ + os.path.join(os.getcwd(), "knowledge_base.json")
112
+ )
113
+ self.json_result_path = os.path.join(os.getcwd(), "knowledge_base.json")
114
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
115
 
116
  def __construct_knowledge_base_dataframe(self):
 
117
  rows = []
118
  for file_path, content in self.files_info.items():
119
  file_full_content = content["file_full_content"]
 
127
  }
128
  rows.append(row)
129
 
130
+ columns = [
131
+ "file_name",
132
+ "page_num",
133
+ "page_content",
134
+ "page_embedding",
135
+ "file_full_content",
136
+ ]
137
  df = pd.DataFrame(rows, columns=columns)
138
  return df
139
 
140
  def __dump_to_csv(self):
141
  df = self.__construct_knowledge_base_dataframe()
142
+ df.to_csv(os.path.join(os.getcwd(), "knowledge_base.csv"), index=False)
143
+ print(
144
+ "Dumping to csv, the path is: "
145
+ + os.path.join(os.getcwd(), "knowledge_base.csv")
146
+ )
147
+ self.csv_result_path = os.path.join(os.getcwd(), "knowledge_base.csv")
148
 
149
  def __get_file_name(self, file_src):
150
  file_paths = [x.name for x in file_src]
 
156
  while chunk := f.read(8192):
157
  md5_hash.update(chunk)
158
 
159
+ return md5_hash.hexdigest()