ChenyuRabbitLove commited on
Commit
e873140
·
1 Parent(s): ca406a4

Revert "fix/ format and modify __get_index_file sequence"

Browse files

This reverts commit 89eba912dc6d3bd3743bef5335bb251459ad49fc.

app.py CHANGED
@@ -1,14 +1,35 @@
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
2
 
 
 
3
  from utils.chatbot import Chatbot
4
  from utils.utils import *
 
 
 
 
5
 
6
- # start of gradio interface
7
  with gr.Blocks() as demo:
 
 
 
 
8
  user_chatbot = gr.State(Chatbot())
 
 
 
9
 
10
  with gr.Row():
11
- gr.HTML("Junyi Academy Chatbot")
12
  with gr.Row(equal_height=True):
13
  with gr.Column(scale=5):
14
  with gr.Row():
@@ -27,55 +48,48 @@ with gr.Blocks() as demo:
27
  with gr.Column(min_width=70, scale=1):
28
  submit_btn = gr.Button("傳送")
29
 
30
- with gr.Row():
31
- index_file = gr.File(
32
- file_count="multiple", file_types=["pdf"], label="Upload PDF file"
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  with gr.Row():
36
- instruction = gr.Markdown(
37
- """
 
 
38
  ## 使用說明
39
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
40
  2. 在上方輸入欄輸入問題,系統將自動回覆
41
  3. 可以根據下方的摘要內容來提問
42
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
43
- 5. 要切換檢索的文件,請點選「清除」按鈕後再重新提問
44
- """
45
- )
46
 
47
  with gr.Row():
48
- describe = gr.Markdown("", visible=True)
49
-
50
- # end of gradio interface
51
 
52
- # start of workflow controller
53
-
54
- # defining workflow of user bot interaction
55
- bot_args = dict(
56
- fn=bot,
57
- inputs=user_chatbot,
58
- outputs=chatbot,
59
- )
60
-
61
- user_args = dict(
62
- fn=user,
63
- inputs=[user_chatbot, user_input],
64
- outputs=[user_input, chatbot],
65
- queue=False,
66
- )
67
-
68
- response = user_input.submit(**user_args).then(**bot_args)
69
-
70
- response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
71
-
72
- submit_btn.click(
73
- **user_args,
74
- ).then(
75
- **bot_args
76
- ).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
77
-
78
- # defining workflow of clear state
79
  clear_state_args = dict(
80
  fn=clear_state,
81
  inputs=user_chatbot,
@@ -84,7 +98,6 @@ with gr.Blocks() as demo:
84
 
85
  clear_btn.click(**clear_state_args)
86
 
87
- # defining workflow of building knowledge base
88
  send_system_nofification_args = dict(
89
  fn=send_system_nofification,
90
  inputs=user_chatbot,
@@ -103,13 +116,12 @@ with gr.Blocks() as demo:
103
  outputs=[describe],
104
  )
105
 
106
- index_file.upload(**send_system_nofification_args).then(
107
- lambda: gr.update(interactive=True), None, None, queue=False
108
- ).then(**bulid_knowledge_base_args).then(**send_system_nofification_args).then(
109
- lambda: gr.update(interactive=True), None, None, queue=False
110
- ).then(
111
- **change_md_args
112
- )
113
-
114
  if __name__ == "__main__":
115
  demo.launch()
 
1
+ import json
2
+ import time
3
+ import random
4
+ import os
5
+
6
+ import openai
7
  import gradio as gr
8
+ import pandas as pd
9
+ import numpy as np
10
+ from openai.embeddings_utils import distances_from_embeddings
11
 
12
+ from utils.gpt_processor import QuestionAnswerer
13
+ from utils.work_flow_controller import WorkFlowController
14
  from utils.chatbot import Chatbot
15
  from utils.utils import *
16
+
17
+ def create_chatbot():
18
+ bot = Chatbot()
19
+ return bot
20
 
 
21
  with gr.Blocks() as demo:
22
+ history = gr.State([])
23
+ user_question = gr.State("")
24
+ chatbot_utils = Chatbot()
25
+
26
  user_chatbot = gr.State(Chatbot())
27
+
28
+ upload_state = gr.State("wating")
29
+ finished = gr.State("finished")
30
 
31
  with gr.Row():
32
+ gr.HTML('Junyi Academy Chatbot')
33
  with gr.Row(equal_height=True):
34
  with gr.Column(scale=5):
35
  with gr.Row():
 
48
  with gr.Column(min_width=70, scale=1):
49
  submit_btn = gr.Button("傳送")
50
 
51
+ bot_args = dict(
52
+ fn=bot,
53
+ inputs=user_chatbot,
54
+ outputs=chatbot,
55
+ )
56
+
57
+ user_args = dict(
58
+ fn=user,
59
+ inputs=[user_chatbot, user_input],
60
+ outputs=[user_input, chatbot],
61
+ queue=False,
62
+ )
63
+
64
+ response = user_input.submit(**user_args).then(**bot_args)
65
+
66
+ response.then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
67
+
68
+ submit_btn.click(user,
69
+ [user_input, chatbot],
70
+ [user_input, chatbot],
71
+ chatbot,
72
+ queue=False).then(**bot_args).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
73
+
74
+
75
 
76
  with gr.Row():
77
+ index_file = gr.File(file_count="multiple", file_types=["pdf"], label="Upload PDF file")
78
+
79
+ with gr.Row():
80
+ instruction = gr.Markdown("""
81
  ## 使用說明
82
  1. 上傳一個或多個 PDF 檔案,系統將自動進行摘要、翻譯等處理後建立知識庫
83
  2. 在上方輸入欄輸入問題,系統將自動回覆
84
  3. 可以根據下方的摘要內容來提問
85
  4. 每次對話會根據第一個問題的內容來檢索所有文件,並挑選最能回答問題的文件來回覆
86
+ 5. 要切換檢索的文件,請點選「清除對話記錄」按鈕後再重新提問
87
+ """)
 
88
 
89
  with gr.Row():
90
+ describe = gr.Markdown('', visible=True)
 
 
91
 
92
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  clear_state_args = dict(
94
  fn=clear_state,
95
  inputs=user_chatbot,
 
98
 
99
  clear_btn.click(**clear_state_args)
100
 
 
101
  send_system_nofification_args = dict(
102
  fn=send_system_nofification,
103
  inputs=user_chatbot,
 
116
  outputs=[describe],
117
  )
118
 
119
+ index_file.upload(**send_system_nofification_args) \
120
+ .then(lambda: gr.update(interactive=True), None, None, queue=False) \
121
+ .then(**bulid_knowledge_base_args) \
122
+ .then(**send_system_nofification_args) \
123
+ .then(lambda: gr.update(interactive=True), None, None, queue=False) \
124
+ .then(**change_md_args)
125
+
 
126
  if __name__ == "__main__":
127
  demo.launch()
utils/chatbot.py CHANGED
@@ -9,59 +9,57 @@ from openai.embeddings_utils import distances_from_embeddings
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
-
13
- class Chatbot:
14
  def __init__(self) -> None:
15
  self.history = []
16
- self.upload_state = "waiting"
17
-
18
  self.knowledge_base = None
19
  self.context = None
20
  self.context_page_num = None
21
  self.context_file_name = None
 
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
- with open(self.csv_result_path, "r", encoding="UTF-8") as fp:
29
  knowledge_base = pd.read_csv(fp)
30
- knowledge_base["page_embedding"] = (
31
- knowledge_base["page_embedding"].apply(eval).apply(np.array)
32
- )
33
 
34
  self.knowledge_base = knowledge_base
35
- self.upload_state = "done"
36
 
37
  def clear_state(self):
38
  self.context = None
39
  self.context_page_num = None
40
  self.context_file_name = None
41
- self.upload_state = "waiting"
42
  self.history = []
43
 
44
  def send_system_nofification(self):
45
- if self.upload_state == "waiting":
46
- conversation = [["已上傳文件", "文件處理中(摘要、翻譯等),結束後將自動回覆"]]
47
  return conversation
48
- elif self.upload_state == "done":
49
- conversation = [["已上傳文件", "文件處理完成,請開始提問"]]
50
  return conversation
51
-
52
  def change_md(self):
53
  content = self.__construct_summary()
54
  return gr.Markdown.update(content, visible=True)
55
-
56
  def __construct_summary(self):
57
- with open(self.json_result_path, "r", encoding="UTF-8") as fp:
58
  knowledge_base = json.load(fp)
59
 
60
  context = """"""
61
  for key in knowledge_base.keys():
62
- file_name = knowledge_base[key]["file_name"]
63
- total_page = knowledge_base[key]["total_pages"]
64
- summary = knowledge_base[key]["summarized_content"]
65
  file_context = f"""
66
  ### 文件摘要
67
  {file_name} (共 {total_page} 頁)<br><br>
@@ -69,14 +67,14 @@ class Chatbot:
69
  """
70
  context += file_context
71
  return context
72
-
73
  def user(self, message):
74
  self.history += [[message, None]]
75
  return "", self.history
76
-
77
  def bot(self):
78
  user_message = self.history[-1][0]
79
- print(f"user_message: {user_message}")
80
 
81
  if self.knowledge_base is None:
82
  response = [
@@ -84,47 +82,41 @@ class Chatbot:
84
  ]
85
  self.history = response
86
  return self.history
87
-
88
- else:
89
  self.__get_index_file(user_message)
 
90
  if self.context is None:
91
  response = [
92
  [user_message, "無法找到相關文件,請重新提問"],
93
  ]
94
  self.history = response
95
  return self.history
96
- else:
97
- qa_processor = QuestionAnswerer()
98
- bot_message = qa_processor.answer_question(
99
- self.context,
100
- self.context_page_num,
101
- self.context_file_name,
102
- self.history,
103
- )
104
- print(f"bot_message: {bot_message}")
105
- response = [
106
- [user_message, bot_message],
107
- ]
108
- self.history[-1] = response[0]
109
- return self.history
110
-
 
 
 
111
  def __get_index_file(self, user_message):
112
- user_message_embedding = openai.Embedding.create(
113
- input=user_message, engine="text-embedding-ada-002"
114
- )["data"][0]["embedding"]
115
-
116
- self.knowledge_base["distance"] = distances_from_embeddings(
117
- user_message_embedding,
118
- self.knowledge_base["page_embedding"].values,
119
- distance_metric="cosine",
120
- )
121
- self.knowledge_base = self.knowledge_base.sort_values(
122
- by="distance", ascending=True
123
- )
124
 
125
- if self.knowledge_base["distance"].values[0] > 0.2:
126
  self.context = None
127
  else:
128
- self.context = self.knowledge_base["page_content"].values[0]
129
- self.context_page_num = self.knowledge_base["page_num"].values[0]
130
- self.context_file_name = self.knowledge_base["file_name"].values[0]
 
9
  from .work_flow_controller import WorkFlowController
10
  from .gpt_processor import QuestionAnswerer
11
 
12
+ class Chatbot():
 
13
  def __init__(self) -> None:
14
  self.history = []
15
+ self.upload_state = 'waiting'
16
+
17
  self.knowledge_base = None
18
  self.context = None
19
  self.context_page_num = None
20
  self.context_file_name = None
21
+
22
 
23
  def build_knowledge_base(self, files):
24
  work_flow_controller = WorkFlowController(files)
25
  self.csv_result_path = work_flow_controller.csv_result_path
26
  self.json_result_path = work_flow_controller.json_result_path
27
 
28
+ with open(self.csv_result_path, 'r', encoding='UTF-8') as fp:
29
  knowledge_base = pd.read_csv(fp)
30
+ knowledge_base['page_embedding'] = knowledge_base['page_embedding'].apply(eval).apply(np.array)
 
 
31
 
32
  self.knowledge_base = knowledge_base
33
+ self.upload_state = 'done'
34
 
35
  def clear_state(self):
36
  self.context = None
37
  self.context_page_num = None
38
  self.context_file_name = None
39
+ self.upload_state = 'waiting'
40
  self.history = []
41
 
42
  def send_system_nofification(self):
43
+ if self.upload_state == 'waiting':
44
+ conversation = [['已上傳文件', '文件處理中(摘要、翻譯等),結束後將自動回覆']]
45
  return conversation
46
+ elif self.upload_state == 'done':
47
+ conversation = [['已上傳文件', '文件處理完成,請開始提問']]
48
  return conversation
49
+
50
  def change_md(self):
51
  content = self.__construct_summary()
52
  return gr.Markdown.update(content, visible=True)
53
+
54
  def __construct_summary(self):
55
+ with open(self.json_result_path, 'r', encoding='UTF-8') as fp:
56
  knowledge_base = json.load(fp)
57
 
58
  context = """"""
59
  for key in knowledge_base.keys():
60
+ file_name = knowledge_base[key]['file_name']
61
+ total_page = knowledge_base[key]['total_pages']
62
+ summary = knowledge_base[key]['summarized_content']
63
  file_context = f"""
64
  ### 文件摘要
65
  {file_name} (共 {total_page} 頁)<br><br>
 
67
  """
68
  context += file_context
69
  return context
70
+
71
  def user(self, message):
72
  self.history += [[message, None]]
73
  return "", self.history
74
+
75
  def bot(self):
76
  user_message = self.history[-1][0]
77
+ print(f'user_message: {user_message}')
78
 
79
  if self.knowledge_base is None:
80
  response = [
 
82
  ]
83
  self.history = response
84
  return self.history
85
+ elif self.context is None:
 
86
  self.__get_index_file(user_message)
87
+ print(f'CONTEXT: {self.context}')
88
  if self.context is None:
89
  response = [
90
  [user_message, "無法找到相關文件,請重新提問"],
91
  ]
92
  self.history = response
93
  return self.history
94
+ else:
95
+ pass
96
+
97
+ if self.context is not None:
98
+ qa_processor = QuestionAnswerer()
99
+ bot_message = qa_processor.answer_question(
100
+ self.context,
101
+ self.context_page_num,
102
+ self.context_file_name,
103
+ self.history
104
+ )
105
+ print(f'bot_message: {bot_message}')
106
+ response = [
107
+ [user_message, bot_message],
108
+ ]
109
+ self.history[-1] = response[0]
110
+ return self.history
111
+
112
  def __get_index_file(self, user_message):
113
+ user_message_embedding = openai.Embedding.create(input=user_message, engine='text-embedding-ada-002')['data'][0]['embedding']
114
+ self.knowledge_base['distance'] = distances_from_embeddings(user_message_embedding, self.knowledge_base['page_embedding'].values, distance_metric='cosine')
115
+ self.knowledge_base = self.knowledge_base.sort_values(by='distance', ascending=True).head(1)
 
 
 
 
 
 
 
 
 
116
 
117
+ if self.knowledge_base['distance'].values[0] > 0.2:
118
  self.context = None
119
  else:
120
+ self.context = self.knowledge_base['page_content'].values[0]
121
+ self.context_page_num = self.knowledge_base['page_num'].values[0]
122
+ self.context_file_name = self.knowledge_base['file_name'].values[0]
utils/docx_processor.py CHANGED
@@ -6,14 +6,13 @@ import docx2txt
6
 
7
  from gpt_processor import Translator
8
 
9
-
10
  class DOCXProcessor:
11
  def __init__(self, file_path: str) -> None:
12
  self.file_path = file_path
13
  self.file_info = {
14
- "file_name": self.file_path.split("/")[-1],
15
- "file_format": "DOCX",
16
- "file_full_content": "",
17
  }
18
  self.__build_info()
19
 
@@ -21,24 +20,21 @@ class DOCXProcessor:
21
  try:
22
  text = docx2txt.process(self.file_path)
23
  text = unicodedata.normalize("NFKD", text)
24
- text = text.replace("\n", " ").replace("\r", "")
25
- text = re.sub(" +", " ", text)
26
- self.file_info["is_chinese"] = self.__is_chinese(text)
27
 
28
  tranlator = Translator()
29
- self.file_info["file_full_content"] = (
30
- tranlator.translate_to_chinese(text)
31
- if not self.file_info["is_chinese"]
32
- else text
33
- )
34
 
 
35
  except FileNotFoundError:
36
  print(f"File not found: {self.file_path}")
37
  except Exception as e:
38
  print(f"An error occurred: {str(e)}")
39
-
40
  def __is_chinese(self, text: str) -> bool:
41
  for char in text:
42
- if char >= "\u4e00" and char <= "\u9fff":
43
  return True
44
- return False
 
6
 
7
  from gpt_processor import Translator
8
 
 
9
  class DOCXProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
+ 'file_name': self.file_path.split('/')[-1],
14
+ 'file_format': 'DOCX',
15
+ 'file_full_content': '',
16
  }
17
  self.__build_info()
18
 
 
20
  try:
21
  text = docx2txt.process(self.file_path)
22
  text = unicodedata.normalize("NFKD", text)
23
+ text = text.replace('\n', ' ').replace('\r', '')
24
+ text = re.sub(' +', ' ', text)
25
+ self.file_info['is_chinese'] = self.__is_chinese(text)
26
 
27
  tranlator = Translator()
28
+ self.file_info['file_full_content'] = tranlator.translate_to_chinese(text) if not self.file_info['is_chinese'] else text
 
 
 
 
29
 
30
+
31
  except FileNotFoundError:
32
  print(f"File not found: {self.file_path}")
33
  except Exception as e:
34
  print(f"An error occurred: {str(e)}")
35
+
36
  def __is_chinese(self, text: str) -> bool:
37
  for char in text:
38
+ if char >= '\u4e00' and char <= '\u9fff':
39
  return True
40
+ return False
utils/pdf_processor.py CHANGED
@@ -5,48 +5,45 @@ import logging
5
 
6
  from .gpt_processor import Translator
7
 
8
-
9
  class PDFProcessor:
10
  def __init__(self, file_path: str) -> None:
11
  self.file_path = file_path
12
  self.file_info = {
13
- "file_name": self.file_path.split("/")[-1],
14
- "file_format": "PDF",
15
- "total_pages": 0,
16
- "file_content": {},
17
- "file_full_content": "",
18
- "is_chinese": "",
19
  }
20
  self.__build_info()
21
 
22
  def __build_info(self) -> None:
23
  try:
24
- with open(self.file_path, "rb") as pdf_file:
25
  pdf_reader = PyPDF2.PdfReader(pdf_file)
26
  pages = len(pdf_reader.pages)
27
- self.file_info["total_pages"] = pages
28
  for i, page in enumerate(pdf_reader.pages):
29
  text = page.extract_text()
30
  text = unicodedata.normalize("NFKD", text)
31
- text = text.replace("\n", " ").replace("\r", "")
32
- text = re.sub(" +", " ", text)
33
- self.file_info["is_chinese"] = self.__is_chinese(text)
34
 
35
  page_info = {}
36
  logging.info(f"Processing page {i + 1}...")
37
- page_info["page_num"] = i + 1
38
- page_info["page_content"] = text
39
- self.file_info["file_content"][i + 1] = page_info
40
- self.file_info["file_full_content"] = (
41
- self.file_info["file_full_content"] + page_info["page_content"]
42
- )
43
  except FileNotFoundError:
44
  print(f"File not found: {self.file_path}")
45
  except Exception as e:
46
  print(f"An error occurred: {str(e)}")
47
-
48
  def __is_chinese(self, text: str) -> bool:
49
  for char in text:
50
- if char >= "\u4e00" and char <= "\u9fff":
51
  return True
52
- return False
 
5
 
6
  from .gpt_processor import Translator
7
 
 
8
  class PDFProcessor:
9
  def __init__(self, file_path: str) -> None:
10
  self.file_path = file_path
11
  self.file_info = {
12
+ 'file_name': self.file_path.split('/')[-1],
13
+ 'file_format': 'PDF',
14
+ 'total_pages': 0,
15
+ 'file_content': {},
16
+ 'file_full_content': '',
17
+ 'is_chinese': '',
18
  }
19
  self.__build_info()
20
 
21
  def __build_info(self) -> None:
22
  try:
23
+ with open(self.file_path, 'rb') as pdf_file:
24
  pdf_reader = PyPDF2.PdfReader(pdf_file)
25
  pages = len(pdf_reader.pages)
26
+ self.file_info['total_pages'] = pages
27
  for i, page in enumerate(pdf_reader.pages):
28
  text = page.extract_text()
29
  text = unicodedata.normalize("NFKD", text)
30
+ text = text.replace('\n', ' ').replace('\r', '')
31
+ text = re.sub(' +', ' ', text)
32
+ self.file_info['is_chinese'] = self.__is_chinese(text)
33
 
34
  page_info = {}
35
  logging.info(f"Processing page {i + 1}...")
36
+ page_info['page_num'] = i + 1
37
+ page_info['page_content'] = text
38
+ self.file_info['file_content'][i + 1] = page_info
39
+ self.file_info['file_full_content'] = self.file_info['file_full_content'] + page_info['page_content']
 
 
40
  except FileNotFoundError:
41
  print(f"File not found: {self.file_path}")
42
  except Exception as e:
43
  print(f"An error occurred: {str(e)}")
44
+
45
  def __is_chinese(self, text: str) -> bool:
46
  for char in text:
47
+ if char >= '\u4e00' and char <= '\u9fff':
48
  return True
49
+ return False
utils/utils.py CHANGED
@@ -1,26 +1,21 @@
 
1
  def clear_state(chatbot, *args):
2
  return chatbot.clear_state(*args)
3
 
4
-
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
8
-
9
  def build_knowledge_base(chatbot, *args):
10
  return chatbot.build_knowledge_base(*args)
11
 
12
-
13
  def change_md(chatbot, *args):
14
  return chatbot.change_md(*args)
15
 
16
-
17
  def get_index_file(chatbot, *args):
18
  return chatbot.get_index_file(*args)
19
 
20
-
21
  def user(chatbot, *args):
22
  return chatbot.user(*args)
23
 
24
-
25
  def bot(chatbot, *args):
26
- return chatbot.bot(*args)
 
1
+
2
  def clear_state(chatbot, *args):
3
  return chatbot.clear_state(*args)
4
 
 
5
  def send_system_nofification(chatbot, *args):
6
  return chatbot.send_system_nofification(*args)
7
 
 
8
  def build_knowledge_base(chatbot, *args):
9
  return chatbot.build_knowledge_base(*args)
10
 
 
11
  def change_md(chatbot, *args):
12
  return chatbot.change_md(*args)
13
 
 
14
  def get_index_file(chatbot, *args):
15
  return chatbot.get_index_file(*args)
16
 
 
17
  def user(chatbot, *args):
18
  return chatbot.user(*args)
19
 
 
20
  def bot(chatbot, *args):
21
+ return chatbot.bot(*args)
utils/work_flow_controller.py CHANGED
@@ -5,21 +5,15 @@ import hashlib
5
 
6
  import pandas as pd
7
 
8
- from .gpt_processor import (
9
- EmbeddingGenerator,
10
- KeywordsGenerator,
11
- Summarizer,
12
- TopicsGenerator,
13
- Translator,
14
- )
15
  from .pdf_processor import PDFProcessor
16
 
17
  processors = {
18
- "pdf": PDFProcessor,
19
  }
20
 
21
-
22
- class WorkFlowController:
23
  def __init__(self, file_src) -> None:
24
  # check if the file_path is list
25
  # self.file_paths = self.__get_file_name(file_src)
@@ -30,8 +24,8 @@ class WorkFlowController:
30
  self.files_info = {}
31
 
32
  for file_path in self.file_paths:
33
- file_name = file_path.split("/")[-1]
34
- file_format = file_path.split(".")[-1]
35
  self.file_processor = processors[file_format]
36
  file = self.file_processor(file_path).file_info
37
  file = self.__process_file(file)
@@ -40,25 +34,24 @@ class WorkFlowController:
40
  self.__dump_to_json()
41
  self.__dump_to_csv()
42
 
 
43
  def __get_summary(self, file: dict):
44
  # get summary from file content
45
-
46
  summarizer = Summarizer()
47
- file["summarized_content"] = summarizer.summarize(file["file_full_content"])
48
  return file
49
 
50
  def __get_keywords(self, file: dict):
51
  # get keywords from file content
52
  keywords_generator = KeywordsGenerator()
53
- file["keywords"] = keywords_generator.extract_keywords(
54
- file["file_full_content"]
55
- )
56
  return file
57
 
58
  def __get_topics(self, file: dict):
59
  # get topics from file content
60
  topics_generator = TopicsGenerator()
61
- file["topics"] = topics_generator.extract_topics(file["file_full_content"])
62
  return file
63
 
64
  def __get_embedding(self, file):
@@ -66,54 +59,41 @@ class WorkFlowController:
66
  # return embedding
67
  embedding_generator = EmbeddingGenerator()
68
 
69
- for i, _ in enumerate(file["file_content"]):
70
  # use i+1 to meet the index of file_content
71
- file["file_content"][i + 1][
72
- "page_embedding"
73
- ] = embedding_generator.get_embedding(
74
- file["file_content"][i + 1]["page_content"]
75
- )
76
  return file
 
77
 
78
  def __translate_to_chinese(self, file: dict):
79
  # translate file content to chinese
80
  translator = Translator()
81
  # reset the file full content
82
- file["file_full_content"] = ""
83
 
84
- for i, _ in enumerate(file["file_content"]):
85
  # use i+1 to meet the index of file_content
86
- file["file_content"][i + 1][
87
- "page_content"
88
- ] = translator.translate_to_chinese(
89
- file["file_content"][i + 1]["page_content"]
90
- )
91
- file["file_full_content"] = (
92
- file["file_full_content"] + file["file_content"][i + 1]["page_content"]
93
- )
94
  return file
95
-
96
  def __process_file(self, file: dict):
97
  # process file content
98
  # return processed data
99
- if not file["is_chinese"]:
100
  file = self.__translate_to_chinese(file)
101
  file = self.__get_embedding(file)
102
  file = self.__get_summary(file)
103
  return file
104
 
105
  def __dump_to_json(self):
106
- with open(
107
- os.path.join(os.getcwd(), "knowledge_base.json"), "w", encoding="utf-8"
108
- ) as f:
109
- print(
110
- "Dumping to json, the path is: "
111
- + os.path.join(os.getcwd(), "knowledge_base.json")
112
- )
113
- self.json_result_path = os.path.join(os.getcwd(), "knowledge_base.json")
114
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
115
 
116
  def __construct_knowledge_base_dataframe(self):
 
117
  rows = []
118
  for file_path, content in self.files_info.items():
119
  file_full_content = content["file_full_content"]
@@ -127,24 +107,15 @@ class WorkFlowController:
127
  }
128
  rows.append(row)
129
 
130
- columns = [
131
- "file_name",
132
- "page_num",
133
- "page_content",
134
- "page_embedding",
135
- "file_full_content",
136
- ]
137
  df = pd.DataFrame(rows, columns=columns)
138
  return df
139
 
140
  def __dump_to_csv(self):
141
  df = self.__construct_knowledge_base_dataframe()
142
- df.to_csv(os.path.join(os.getcwd(), "knowledge_base.csv"), index=False)
143
- print(
144
- "Dumping to csv, the path is: "
145
- + os.path.join(os.getcwd(), "knowledge_base.csv")
146
- )
147
- self.csv_result_path = os.path.join(os.getcwd(), "knowledge_base.csv")
148
 
149
  def __get_file_name(self, file_src):
150
  file_paths = [x.name for x in file_src]
@@ -156,4 +127,4 @@ class WorkFlowController:
156
  while chunk := f.read(8192):
157
  md5_hash.update(chunk)
158
 
159
- return md5_hash.hexdigest()
 
5
 
6
  import pandas as pd
7
 
8
+ from .gpt_processor import (EmbeddingGenerator, KeywordsGenerator, Summarizer,
9
+ TopicsGenerator, Translator)
 
 
 
 
 
10
  from .pdf_processor import PDFProcessor
11
 
12
  processors = {
13
+ 'pdf': PDFProcessor,
14
  }
15
 
16
+ class WorkFlowController():
 
17
  def __init__(self, file_src) -> None:
18
  # check if the file_path is list
19
  # self.file_paths = self.__get_file_name(file_src)
 
24
  self.files_info = {}
25
 
26
  for file_path in self.file_paths:
27
+ file_name = file_path.split('/')[-1]
28
+ file_format = file_path.split('.')[-1]
29
  self.file_processor = processors[file_format]
30
  file = self.file_processor(file_path).file_info
31
  file = self.__process_file(file)
 
34
  self.__dump_to_json()
35
  self.__dump_to_csv()
36
 
37
+
38
  def __get_summary(self, file: dict):
39
  # get summary from file content
40
+
41
  summarizer = Summarizer()
42
+ file['summarized_content'] = summarizer.summarize(file['file_full_content'])
43
  return file
44
 
45
  def __get_keywords(self, file: dict):
46
  # get keywords from file content
47
  keywords_generator = KeywordsGenerator()
48
+ file['keywords'] = keywords_generator.extract_keywords(file['file_full_content'])
 
 
49
  return file
50
 
51
  def __get_topics(self, file: dict):
52
  # get topics from file content
53
  topics_generator = TopicsGenerator()
54
+ file['topics'] = topics_generator.extract_topics(file['file_full_content'])
55
  return file
56
 
57
  def __get_embedding(self, file):
 
59
  # return embedding
60
  embedding_generator = EmbeddingGenerator()
61
 
62
+ for i, _ in enumerate(file['file_content']):
63
  # use i+1 to meet the index of file_content
64
+ file['file_content'][i+1]['page_embedding'] = embedding_generator.get_embedding(file['file_content'][i+1]['page_content'])
 
 
 
 
65
  return file
66
+
67
 
68
  def __translate_to_chinese(self, file: dict):
69
  # translate file content to chinese
70
  translator = Translator()
71
  # reset the file full content
72
+ file['file_full_content'] = ''
73
 
74
+ for i, _ in enumerate(file['file_content']):
75
  # use i+1 to meet the index of file_content
76
+ file['file_content'][i+1]['page_content'] = translator.translate_to_chinese(file['file_content'][i+1]['page_content'])
77
+ file['file_full_content'] = file['file_full_content'] + file['file_content'][i+1]['page_content']
 
 
 
 
 
 
78
  return file
79
+
80
  def __process_file(self, file: dict):
81
  # process file content
82
  # return processed data
83
+ if not file['is_chinese']:
84
  file = self.__translate_to_chinese(file)
85
  file = self.__get_embedding(file)
86
  file = self.__get_summary(file)
87
  return file
88
 
89
  def __dump_to_json(self):
90
+ with open(os.path.join(os.getcwd(), 'knowledge_base.json'), 'w', encoding='utf-8') as f:
91
+ print("Dumping to json, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.json'))
92
+ self.json_result_path = os.path.join(os.getcwd(), 'knowledge_base.json')
 
 
 
 
 
93
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
94
 
95
  def __construct_knowledge_base_dataframe(self):
96
+
97
  rows = []
98
  for file_path, content in self.files_info.items():
99
  file_full_content = content["file_full_content"]
 
107
  }
108
  rows.append(row)
109
 
110
+ columns = ["file_name", "page_num", "page_content", "page_embedding", "file_full_content"]
 
 
 
 
 
 
111
  df = pd.DataFrame(rows, columns=columns)
112
  return df
113
 
114
  def __dump_to_csv(self):
115
  df = self.__construct_knowledge_base_dataframe()
116
+ df.to_csv(os.path.join(os.getcwd(), 'knowledge_base.csv'), index=False)
117
+ print("Dumping to csv, the path is: " + os.path.join(os.getcwd(), 'knowledge_base.csv'))
118
+ self.csv_result_path = os.path.join(os.getcwd(), 'knowledge_base.csv')
 
 
 
119
 
120
  def __get_file_name(self, file_src):
121
  file_paths = [x.name for x in file_src]
 
127
  while chunk := f.read(8192):
128
  md5_hash.update(chunk)
129
 
130
+ return md5_hash.hexdigest()