ChenyuRabbitLove committed on
Commit
ca406a4
·
1 Parent(s): b95388b

fix/ format and modify __get_index_file sequence

Browse files
app.py CHANGED
@@ -1,35 +1,14 @@
1
- import json
2
- import time
3
- import random
4
- import os
5
-
6
- import openai
7
  import gradio as gr
8
- import pandas as pd
9
- import numpy as np
10
- from openai.embeddings_utils import distances_from_embeddings
11
 
12
- from utils.gpt_processor import QuestionAnswerer
13
- from utils.work_flow_controller import WorkFlowController
14
  from utils.chatbot import Chatbot
15
  from utils.utils import *
16
-
17
- def create_chatbot():
18
- bot = Chatbot()
19
- return bot
20
 
 
21
  with gr.Blocks() as demo:
22
- history = gr.State([])
23
- user_question = gr.State("")
24
- chatbot_utils = Chatbot()
25
-
26
  user_chatbot = gr.State(Chatbot())
27
-
28
- upload_state = gr.State("wating")
29
- finished = gr.State("finished")
30
 
31
  with gr.Row():
32
- gr.HTML('Junyi Academy Chatbot')
33
  with gr.Row(equal_height=True):
34
  with gr.Column(scale=5):
35
  with gr.Row():
@@ -48,48 +27,55 @@ with gr.Blocks() as demo:
48
  with gr.Column(min_width=70, scale=1):
49
  submit_btn = gr.Button("傳送")
50
 
51
- bot_args = dict(
52
- fn=bot,
53
- inputs=user_chatbot,
54
- outputs=chatbot,
55
- )
56
-
57
- user_args = dict(
58
- fn=user,
59
- inputs=[user_chatbot, user_input],
60
- outputs=[user_input, chatbot],
61
- queue=False,
62
- )
63
-
64
- response = user_input.submit(**user_args).then(**bot_args)
65
-
66
- response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
67
-
68
- submit_btn.click(user,
69
- [user_input, chatbot],
70
- [user_input, chatbot],
71
- chatbot,
72
- queue=False).then(**bot_args).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
73
-
74
-
75
-
76
  with gr.Row():
77
- index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
78
-
 
 
79
  with gr.Row():
80
- instruction = gr.Markdown("""
 
81
  ## 使用說明
82
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
83
  2. 在上方輸入欄輸入問題,系統將自動回覆
84
  3. 可以根據下方的摘要內容來提問
85
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
86
- 5. 要切換檢索的文件,請點選「清除對話記錄」按鈕後再重新提問
87
- """)
 
88
 
89
  with gr.Row():
90
- describe = gr.Markdown('', visible=True)
 
 
91
 
92
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  clear_state_args = dict(
94
  fn=clear_state,
95
  inputs=user_chatbot,
@@ -98,6 +84,7 @@ with gr.Blocks() as demo:
98
 
99
  clear_btn.click(**clear_state_args)
100
 
 
101
  send_system_nofification_args = dict(
102
  fn=send_system_nofification,
103
  inputs=user_chatbot,
@@ -116,12 +103,13 @@ with gr.Blocks() as demo:
116
  outputs=[describe],
117
  )
118
 
119
- index_file.upload(**send_system_nofification_args) \
120
- .then(lambda: gr.update(interactive=True), None, None, queue=False) \
121
- .then(**bulid_knowledge_base_args) \
122
- .then(**send_system_nofification_args) \
123
- .then(lambda: gr.update(interactive=True), None, None, queue=False) \
124
- .then(**change_md_args)
125
-
 
126
  if __name__ == "__main__":
127
  demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
2
 
 
 
3
  from utils.chatbot import Chatbot
4
  from utils.utils import *
 
 
 
 
5
 
6
+ # start of gradio interface
7
  with gr.Blocks() as demo:
 
 
 
 
8
  user_chatbot = gr.State(Chatbot())
 
 
 
9
 
10
  with gr.Row():
11
+ gr.HTML("Junyi Academy Chatbot")
12
  with gr.Row(equal_height=True):
13
  with gr.Column(scale=5):
14
  with gr.Row():
 
27
  with gr.Column(min_width=70, scale=1):
28
  submit_btn = gr.Button("傳送")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  with gr.Row():
31
+ index_file = gr.File(
32
+ file_count="multiple", file_types=["pdf"], label="Upload PDF file"
33
+ )
34
+
35
  with gr.Row():
36
+ instruction = gr.Markdown(
37
+ """
38
  ## 使用說明
39
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
40
  2. 在上方輸入欄輸入問題,系統將自動回覆
41
  3. 可以根據下方的摘要內容來提問
42
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
43
+ 5. 要切換檢索的文件,請點選「清除」按鈕後再重新提問
44
+ """
45
+ )
46
 
47
  with gr.Row():
48
+ describe = gr.Markdown("", visible=True)
49
+
50
+ # end of gradio interface
51
 
52
+ # start of workflow controller
53
+
54
+ # defining workflow of user bot interaction
55
+ bot_args = dict(
56
+ fn=bot,
57
+ inputs=user_chatbot,
58
+ outputs=chatbot,
59
+ )
60
+
61
+ user_args = dict(
62
+ fn=user,
63
+ inputs=[user_chatbot, user_input],
64
+ outputs=[user_input, chatbot],
65
+ queue=False,
66
+ )
67
+
68
+ response = user_input.submit(**user_args).then(**bot_args)
69
+
70
+ response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
71
+
72
+ submit_btn.click(
73
+ **user_args,
74
+ ).then(
75
+ **bot_args
76
+ ).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
77
+
78
+ # defining workflow of clear state
79
  clear_state_args = dict(
80
  fn=clear_state,
81
  inputs=user_chatbot,
 
84
 
85
  clear_btn.click(**clear_state_args)
86
 
87
+ # defining workflow of building knowledge base
88
  send_system_nofification_args = dict(
89
  fn=send_system_nofification,
90
  inputs=user_chatbot,
 
103
  outputs=[describe],
104
  )
105
 
106
+ index_file.upload(**send_system_nofification_args).then(
107
+ lambda: gr.update(interactive=True), None, None, queue=False
108
+ ).then(**bulid_knowledge_base_args).then(**send_system_nofification_args).then(
109
+ lambda: gr.update(interactive=True), None, None, queue=False
110
+ ).then(
111
+ **change_md_args
112
+ )
113
+
114
  if __name__ == "__main__":
115
  demo.launch()
utils/chatbot.py CHANGED
@@ -9,57 +9,59 @@ from openai.embeddings_utils import distances_from_embeddings
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
- class Chatbot():
 
13
  def __init__(self) -> None:
14
  self.history = []
15
- self.upload_state = 'waiting'
16
-
17
  self.knowledge_base = None
18
  self.context = None
19
  self.context_page_num = None
20
  self.context_file_name = None
21
-
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
- with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
29
  knowledge_base = pd.read_csv(fp)
30
- knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
 
 
31
 
32
  self.knowledge_base = knowledge_base
33
- self.upload_state = 'done'
34
 
35
  def clear_state(self):
36
  self.context = None
37
  self.context_page_num = None
38
  self.context_file_name = None
39
- self.upload_state = 'waiting'
40
  self.history = []
41
 
42
  def send_system_nofification(self):
43
- if self.upload_state == 'waiting':
44
- conversation = [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
45
  return conversation
46
- elif self.upload_state == 'done':
47
- conversation = [['已上傳文件', '文件處理完成,請開始提問']]
48
  return conversation
49
-
50
  def change_md(self):
51
  content = self.__construct_summary()
52
  return gr.Markdown.update(content, visible=True)
53
-
54
  def __construct_summary(self):
55
- with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
56
  knowledge_base = json.load(fp)
57
 
58
  context = """"""
59
  for key in knowledge_base.keys():
60
- file_name = knowledge_base[key]['file_name']
61
- total_page = knowledge_base[key]['total_pages']
62
- summary = knowledge_base[key]['summarized_content']
63
  file_context = f"""
64
  ### 文件摘要
65
  {file_name} (共 {total_page} 頁)<br><br>
@@ -67,14 +69,14 @@ class Chatbot():
67
  """
68
  context += file_context
69
  return context
70
-
71
  def user(self, message):
72
  self.history += [[message, None]]
73
  return "", self.history
74
-
75
  def bot(self):
76
  user_message = self.history[-1][0]
77
- print(f'user_message: {user_message}')
78
 
79
  if self.knowledge_base is None:
80
  response = [
@@ -82,41 +84,47 @@ class Chatbot():
82
  ]
83
  self.history = response
84
  return self.history
85
- elif self.context is None:
 
86
  self.__get_index_file(user_message)
87
- print(f'CONTEXT: {self.context}')
88
  if self.context is None:
89
  response = [
90
  [user_message, "無法找到相關文件,請重新提問"],
91
  ]
92
  self.history = response
93
  return self.history
94
- else:
95
- pass
96
-
97
- if self.context is not None:
98
- qa_processor = QuestionAnswerer()
99
- bot_message = qa_processor.answer_question(
100
- self.context,
101
- self.context_page_num,
102
- self.context_file_name,
103
- self.history
104
- )
105
- print(f'bot_message: {bot_message}')
106
- response = [
107
- [user_message, bot_message],
108
- ]
109
- self.history[-1] = response[0]
110
- return self.history
111
-
112
  def __get_index_file(self, user_message):
113
- user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
114
- self.knowledge_base['distance'] = distances_from_embeddings(user_message_embedding, self.knowledge_base['page_embedding'].values, distance_metric='cosine')
115
- self.knowledge_base = self.knowledge_base.sort_values(by='distance', ascending=True).head(1)
 
 
 
 
 
 
 
 
 
116
 
117
- if self.knowledge_base['distance'].values[0] > 0.2:
118
  self.context = None
119
  else:
120
- self.context = self.knowledge_base['page_content'].values[0]
121
- self.context_page_num = self.knowledge_base['page_num'].values[0]
122
- self.context_file_name = self.knowledge_base['file_name'].values[0]
 
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
+
13
+ class Chatbot:
14
  def __init__(self) -> None:
15
  self.history = []
16
+ self.upload_state = "waiting"
17
+
18
  self.knowledge_base = None
19
  self.context = None
20
  self.context_page_num = None
21
  self.context_file_name = None
 
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
+ with open(self.csv_result_path, "r", encoding="UTF-8") as fp:
29
  knowledge_base = pd.read_csv(fp)
30
+ knowledge_base["page_embedding"] = (
31
+ knowledge_base["page_embedding"].apply(eval).apply(np.array)
32
+ )
33
 
34
  self.knowledge_base = knowledge_base
35
+ self.upload_state = "done"
36
 
37
  def clear_state(self):
38
  self.context = None
39
  self.context_page_num = None
40
  self.context_file_name = None
41
+ self.upload_state = "waiting"
42
  self.history = []
43
 
44
  def send_system_nofification(self):
45
+ if self.upload_state == "waiting":
46
+ conversation = [["已上傳文件", "文件處理中(摘要、翻譯等),結束後將自動回覆"]]
47
  return conversation
48
+ elif self.upload_state == "done":
49
+ conversation = [["已上傳文件", "文件處理完成,請開始提問"]]
50
  return conversation
51
+
52
  def change_md(self):
53
  content = self.__construct_summary()
54
  return gr.Markdown.update(content, visible=True)
55
+
56
  def __construct_summary(self):
57
+ with open(self.json_result_path, "r", encoding="UTF-8") as fp:
58
  knowledge_base = json.load(fp)
59
 
60
  context = """"""
61
  for key in knowledge_base.keys():
62
+ file_name = knowledge_base[key]["file_name"]
63
+ total_page = knowledge_base[key]["total_pages"]
64
+ summary = knowledge_base[key]["summarized_content"]
65
  file_context = f"""
66
  ### 文件摘要
67
  {file_name} (共 {total_page} 頁)<br><br>
 
69
  """
70
  context += file_context
71
  return context
72
+
73
  def user(self, message):
74
  self.history += [[message, None]]
75
  return "", self.history
76
+
77
  def bot(self):
78
  user_message = self.history[-1][0]
79
+ print(f"user_message: {user_message}")
80
 
81
  if self.knowledge_base is None:
82
  response = [
 
84
  ]
85
  self.history = response
86
  return self.history
87
+
88
+ else:
89
  self.__get_index_file(user_message)
 
90
  if self.context is None:
91
  response = [
92
  [user_message, "無法找到相關文件,請重新提問"],
93
  ]
94
  self.history = response
95
  return self.history
96
+ else:
97
+ qa_processor = QuestionAnswerer()
98
+ bot_message = qa_processor.answer_question(
99
+ self.context,
100
+ self.context_page_num,
101
+ self.context_file_name,
102
+ self.history,
103
+ )
104
+ print(f"bot_message: {bot_message}")
105
+ response = [
106
+ [user_message, bot_message],
107
+ ]
108
+ self.history[-1] = response[0]
109
+ return self.history
110
+
 
 
 
111
  def __get_index_file(self, user_message):
112
+ user_message_embedding = openai.Embedding.create(
113
+ input=user_message, engine="text-embedding-ada-002"
114
+ )["data"][0]["embedding"]
115
+
116
+ self.knowledge_base["distance"] = distances_from_embeddings(
117
+ user_message_embedding,
118
+ self.knowledge_base["page_embedding"].values,
119
+ distance_metric="cosine",
120
+ )
121
+ self.knowledge_base = self.knowledge_base.sort_values(
122
+ by="distance", ascending=True
123
+ )
124
 
125
+ if self.knowledge_base["distance"].values[0] > 0.2:
126
  self.context = None
127
  else:
128
+ self.context = self.knowledge_base["page_content"].values[0]
129
+ self.context_page_num = self.knowledge_base["page_num"].values[0]
130
+ self.context_file_name = self.knowledge_base["file_name"].values[0]
utils/docx_processor.py CHANGED
@@ -6,13 +6,14 @@ import docx2txt
6
 
7
  from gpt_processor import Translator
8
 
 
9
  class DOCXProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
- 'file_name': self.file_path.split('/')[-1],
14
- 'file_format': 'DOCX',
15
- 'file_full_content': '',
16
  }
17
  self.__build_info()
18
 
@@ -20,21 +21,24 @@ class DOCXProcessor:
20
  try:
21
  text = docx2txt.process(self.file_path)
22
  text = unicodedata.normalize("NFKD", text)
23
- text = text.replace('\n', ' ').replace('\r', '')
24
- text = re.sub(' +', ' ', text)
25
- self.file_info['is_chinese'] = self.__is_chinese(text)
26
 
27
  tranlator = Translator()
28
- self.file_info['file_full_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
 
 
 
 
29
 
30
-
31
  except FileNotFoundError:
32
  print(f"File not found: {self.file_path}")
33
  except Exception as e:
34
  print(f"An error occurred: {str(e)}")
35
-
36
  def __is_chinese(self, text: str) -> bool:
37
  for char in text:
38
- if char >= '\u4e00' and char <= '\u9fff':
39
  return True
40
- return False
 
6
 
7
  from gpt_processor import Translator
8
 
9
+
10
  class DOCXProcessor:
11
  def __init__(self, file_path: str) -> None:
12
  self.file_path = file_path
13
  self.file_info = {
14
+ "file_name": self.file_path.split("/")[-1],
15
+ "file_format": "DOCX",
16
+ "file_full_content": "",
17
  }
18
  self.__build_info()
19
 
 
21
  try:
22
  text = docx2txt.process(self.file_path)
23
  text = unicodedata.normalize("NFKD", text)
24
+ text = text.replace("\n", " ").replace("\r", "")
25
+ text = re.sub(" +", " ", text)
26
+ self.file_info["is_chinese"] = self.__is_chinese(text)
27
 
28
  tranlator = Translator()
29
+ self.file_info["file_full_content"] = (
30
+ tranlator.translate_to_chinese(text)
31
+ if not self.file_info["is_chinese"]
32
+ else text
33
+ )
34
 
 
35
  except FileNotFoundError:
36
  print(f"File not found: {self.file_path}")
37
  except Exception as e:
38
  print(f"An error occurred: {str(e)}")
39
+
40
  def __is_chinese(self, text: str) -> bool:
41
  for char in text:
42
+ if char >= "\u4e00" and char <= "\u9fff":
43
  return True
44
+ return False
utils/pdf_processor.py CHANGED
@@ -5,45 +5,48 @@ import logging
5
 
6
  from .gpt_processor import Translator
7
 
 
8
  class PDFProcessor:
9
  def __init__(self, file_path: str) -> None:
10
  self.file_path = file_path
11
  self.file_info = {
12
- 'file_name': self.file_path.split('/')[-1],
13
- 'file_format': 'PDF',
14
- 'total_pages': 0,
15
- 'file_content': {},
16
- 'file_full_content': '',
17
- 'is_chinese': '',
18
  }
19
  self.__build_info()
20
 
21
  def __build_info(self) -> None:
22
  try:
23
- with open(self.file_path, 'rb') as pdf_file:
24
  pdf_reader = PyPDF2.PdfReader(pdf_file)
25
  pages = len(pdf_reader.pages)
26
- self.file_info['total_pages'] = pages
27
  for i, page in enumerate(pdf_reader.pages):
28
  text = page.extract_text()
29
  text = unicodedata.normalize("NFKD", text)
30
- text = text.replace('\n', ' ').replace('\r', '')
31
- text = re.sub(' +', ' ', text)
32
- self.file_info['is_chinese'] = self.__is_chinese(text)
33
 
34
  page_info = {}
35
  logging.info(f"Processing page {i + 1}...")
36
- page_info['page_num'] = i + 1
37
- page_info['page_content'] = text
38
- self.file_info['file_content'][i + 1] = page_info
39
- self.file_info['file_full_content'] = self.file_info['file_full_content'] + page_info['page_content']
 
 
40
  except FileNotFoundError:
41
  print(f"File not found: {self.file_path}")
42
  except Exception as e:
43
  print(f"An error occurred: {str(e)}")
44
-
45
  def __is_chinese(self, text: str) -> bool:
46
  for char in text:
47
- if char >= '\u4e00' and char <= '\u9fff':
48
  return True
49
- return False
 
5
 
6
  from .gpt_processor import Translator
7
 
8
+
9
  class PDFProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
+ "file_name": self.file_path.split("/")[-1],
14
+ "file_format": "PDF",
15
+ "total_pages": 0,
16
+ "file_content": {},
17
+ "file_full_content": "",
18
+ "is_chinese": "",
19
  }
20
  self.__build_info()
21
 
22
  def __build_info(self) -> None:
23
  try:
24
+ with open(self.file_path, "rb") as pdf_file:
25
  pdf_reader = PyPDF2.PdfReader(pdf_file)
26
  pages = len(pdf_reader.pages)
27
+ self.file_info["total_pages"] = pages
28
  for i, page in enumerate(pdf_reader.pages):
29
  text = page.extract_text()
30
  text = unicodedata.normalize("NFKD", text)
31
+ text = text.replace("\n", " ").replace("\r", "")
32
+ text = re.sub(" +", " ", text)
33
+ self.file_info["is_chinese"] = self.__is_chinese(text)
34
 
35
  page_info = {}
36
  logging.info(f"Processing page {i + 1}...")
37
+ page_info["page_num"] = i + 1
38
+ page_info["page_content"] = text
39
+ self.file_info["file_content"][i + 1] = page_info
40
+ self.file_info["file_full_content"] = (
41
+ self.file_info["file_full_content"] + page_info["page_content"]
42
+ )
43
  except FileNotFoundError:
44
  print(f"File not found: {self.file_path}")
45
  except Exception as e:
46
  print(f"An error occurred: {str(e)}")
47
+
48
  def __is_chinese(self, text: str) -> bool:
49
  for char in text:
50
+ if char >= "\u4e00" and char <= "\u9fff":
51
  return True
52
+ return False
utils/utils.py CHANGED
@@ -1,21 +1,26 @@
1
-
2
  def clear_state(chatbot, *args):
3
  return chatbot.clear_state(*args)
4
 
 
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
 
8
  def build_knowledge_base(chatbot, *args):
9
  return chatbot.build_knowledge_base(*args)
10
 
 
11
  def change_md(chatbot, *args):
12
  return chatbot.change_md(*args)
13
 
 
14
  def get_index_file(chatbot, *args):
15
  return chatbot.get_index_file(*args)
16
 
 
17
  def user(chatbot, *args):
18
  return chatbot.user(*args)
19
 
 
20
  def bot(chatbot, *args):
21
- return chatbot.bot(*args)
 
 
1
  def clear_state(chatbot, *args):
2
  return chatbot.clear_state(*args)
3
 
4
+
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
8
+
9
  def build_knowledge_base(chatbot, *args):
10
  return chatbot.build_knowledge_base(*args)
11
 
12
+
13
  def change_md(chatbot, *args):
14
  return chatbot.change_md(*args)
15
 
16
+
17
  def get_index_file(chatbot, *args):
18
  return chatbot.get_index_file(*args)
19
 
20
+
21
  def user(chatbot, *args):
22
  return chatbot.user(*args)
23
 
24
+
25
  def bot(chatbot, *args):
26
+ return chatbot.bot(*args)
utils/work_flow_controller.py CHANGED
@@ -5,15 +5,21 @@ import hashlib
5
 
6
  import pandas as pd
7
 
8
- from .gpt_processor import (EmbeddingGenerator, KeywordsGenerator, Summarizer,
9
- TopicsGenerator, Translator)
 
 
 
 
 
10
  from .pdf_processor import PDFProcessor
11
 
12
  processors = {
13
- 'pdf': PDFProcessor,
14
  }
15
 
16
- class WorkFlowController():
 
17
  def __init__(self, file_src) -> None:
18
  # check if the file_path is list
19
  # self.file_paths = self.__get_file_name(file_src)
@@ -24,8 +30,8 @@ class WorkFlowController():
24
  self.files_info = {}
25
 
26
  for file_path in self.file_paths:
27
- file_name = file_path.split('/')[-1]
28
- file_format = file_path.split('.')[-1]
29
  self.file_processor = processors[file_format]
30
  file = self.file_processor(file_path).file_info
31
  file = self.__process_file(file)
@@ -34,24 +40,25 @@ class WorkFlowController():
34
  self.__dump_to_json()
35
  self.__dump_to_csv()
36
 
37
-
38
  def __get_summary(self, file: dict):
39
  # get summary from file content
40
-
41
  summarizer = Summarizer()
42
- file['summarized_content'] = summarizer.summarize(file['file_full_content'])
43
  return file
44
 
45
  def __get_keywords(self, file: dict):
46
  # get keywords from file content
47
  keywords_generator = KeywordsGenerator()
48
- file['keywords'] = keywords_generator.extract_keywords(file['file_full_content'])
 
 
49
  return file
50
 
51
  def __get_topics(self, file: dict):
52
  # get topics from file content
53
  topics_generator = TopicsGenerator()
54
- file['topics'] = topics_generator.extract_topics(file['file_full_content'])
55
  return file
56
 
57
  def __get_embedding(self, file):
@@ -59,41 +66,54 @@ class WorkFlowController():
59
  # return embedding
60
  embedding_generator = EmbeddingGenerator()
61
 
62
- for i, _ in enumerate(file['file_content']):
63
  # use i+1 to meet the index of file_content
64
- file['file_content'][i+1]['page_embedding'] = embedding_generator.get_embedding(file['file_content'][i+1]['page_content'])
 
 
 
 
65
  return file
66
-
67
 
68
  def __translate_to_chinese(self, file: dict):
69
  # translate file content to chinese
70
  translator = Translator()
71
  # reset the file full content
72
- file['file_full_content'] = ''
73
 
74
- for i, _ in enumerate(file['file_content']):
75
  # use i+1 to meet the index of file_content
76
- file['file_content'][i+1]['page_content'] = translator.translate_to_chinese(file['file_content'][i+1]['page_content'])
77
- file['file_full_content'] = file['file_full_content'] + file['file_content'][i+1]['page_content']
 
 
 
 
 
 
78
  return file
79
-
80
  def __process_file(self, file: dict):
81
  # process file content
82
  # return processed data
83
- if not file['is_chinese']:
84
  file = self.__translate_to_chinese(file)
85
  file = self.__get_embedding(file)
86
  file = self.__get_summary(file)
87
  return file
88
 
89
  def __dump_to_json(self):
90
- with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
91
- print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
92
- self.json_result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
 
 
 
 
 
93
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
94
 
95
  def __construct_knowledge_base_dataframe(self):
96
-
97
  rows = []
98
  for file_path, content in self.files_info.items():
99
  file_full_content = content["file_full_content"]
@@ -107,15 +127,24 @@ class WorkFlowController():
107
  }
108
  rows.append(row)
109
 
110
- columns = ["file_name", "page_num", "page_content", "page_embedding", "file_full_content"]
 
 
 
 
 
 
111
  df = pd.DataFrame(rows, columns=columns)
112
  return df
113
 
114
  def __dump_to_csv(self):
115
  df = self.__construct_knowledge_base_dataframe()
116
- df.to_csv(os.path.join(os.getcwd(), 'knowledge_base.csv'), index=False)
117
- print("Dumping to csv, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.csv'))
118
- self.csv_result_path = os.path.join(os.getcwd(), 'knowledge_base.csv')
 
 
 
119
 
120
  def __get_file_name(self, file_src):
121
  file_paths = [x.name for x in file_src]
@@ -127,4 +156,4 @@ class WorkFlowController():
127
  while chunk := f.read(8192):
128
  md5_hash.update(chunk)
129
 
130
- return md5_hash.hexdigest()
 
5
 
6
  import pandas as pd
7
 
8
+ from .gpt_processor import (
9
+ EmbeddingGenerator,
10
+ KeywordsGenerator,
11
+ Summarizer,
12
+ TopicsGenerator,
13
+ Translator,
14
+ )
15
  from .pdf_processor import PDFProcessor
16
 
17
  processors = {
18
+ "pdf": PDFProcessor,
19
  }
20
 
21
+
22
+ class WorkFlowController:
23
  def __init__(self, file_src) -> None:
24
  # check if the file_path is list
25
  # self.file_paths = self.__get_file_name(file_src)
 
30
  self.files_info = {}
31
 
32
  for file_path in self.file_paths:
33
+ file_name = file_path.split("/")[-1]
34
+ file_format = file_path.split(".")[-1]
35
  self.file_processor = processors[file_format]
36
  file = self.file_processor(file_path).file_info
37
  file = self.__process_file(file)
 
40
  self.__dump_to_json()
41
  self.__dump_to_csv()
42
 
 
43
  def __get_summary(self, file: dict):
44
  # get summary from file content
45
+
46
  summarizer = Summarizer()
47
+ file["summarized_content"] = summarizer.summarize(file["file_full_content"])
48
  return file
49
 
50
  def __get_keywords(self, file: dict):
51
  # get keywords from file content
52
  keywords_generator = KeywordsGenerator()
53
+ file["keywords"] = keywords_generator.extract_keywords(
54
+ file["file_full_content"]
55
+ )
56
  return file
57
 
58
  def __get_topics(self, file: dict):
59
  # get topics from file content
60
  topics_generator = TopicsGenerator()
61
+ file["topics"] = topics_generator.extract_topics(file["file_full_content"])
62
  return file
63
 
64
  def __get_embedding(self, file):
 
66
  # return embedding
67
  embedding_generator = EmbeddingGenerator()
68
 
69
+ for i, _ in enumerate(file["file_content"]):
70
  # use i+1 to meet the index of file_content
71
+ file["file_content"][i + 1][
72
+ "page_embedding"
73
+ ] = embedding_generator.get_embedding(
74
+ file["file_content"][i + 1]["page_content"]
75
+ )
76
  return file
 
77
 
78
  def __translate_to_chinese(self, file: dict):
79
  # translate file content to chinese
80
  translator = Translator()
81
  # reset the file full content
82
+ file["file_full_content"] = ""
83
 
84
+ for i, _ in enumerate(file["file_content"]):
85
  # use i+1 to meet the index of file_content
86
+ file["file_content"][i + 1][
87
+ "page_content"
88
+ ] = translator.translate_to_chinese(
89
+ file["file_content"][i + 1]["page_content"]
90
+ )
91
+ file["file_full_content"] = (
92
+ file["file_full_content"] + file["file_content"][i + 1]["page_content"]
93
+ )
94
  return file
95
+
96
  def __process_file(self, file: dict):
97
  # process file content
98
  # return processed data
99
+ if not file["is_chinese"]:
100
  file = self.__translate_to_chinese(file)
101
  file = self.__get_embedding(file)
102
  file = self.__get_summary(file)
103
  return file
104
 
105
  def __dump_to_json(self):
106
+ with open(
107
+ os.path.join(os.getcwd(), "knowledge_base.json"), "w", encoding="utf-8"
108
+ ) as f:
109
+ print(
110
+ "Dumping to json, the path is: "
111
+ + os.path.join(os.getcwd(), "knowledge_base.json")
112
+ )
113
+ self.json_result_path = os.path.join(os.getcwd(), "knowledge_base.json")
114
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
115
 
116
  def __construct_knowledge_base_dataframe(self):
 
117
  rows = []
118
  for file_path, content in self.files_info.items():
119
  file_full_content = content["file_full_content"]
 
127
  }
128
  rows.append(row)
129
 
130
+ columns = [
131
+ "file_name",
132
+ "page_num",
133
+ "page_content",
134
+ "page_embedding",
135
+ "file_full_content",
136
+ ]
137
  df = pd.DataFrame(rows, columns=columns)
138
  return df
139
 
140
  def __dump_to_csv(self):
141
  df = self.__construct_knowledge_base_dataframe()
142
+ df.to_csv(os.path.join(os.getcwd(), "knowledge_base.csv"), index=False)
143
+ print(
144
+ "Dumping to csv, the path is: "
145
+ + os.path.join(os.getcwd(), "knowledge_base.csv")
146
+ )
147
+ self.csv_result_path = os.path.join(os.getcwd(), "knowledge_base.csv")
148
 
149
  def __get_file_name(self, file_src):
150
  file_paths = [x.name for x in file_src]
 
156
  while chunk := f.read(8192):
157
  md5_hash.update(chunk)
158
 
159
+ return md5_hash.hexdigest()