ChenyuRabbitLove committed on
Commit
66b707b
·
1 Parent(s): c88c1d9

feat/formatter

Browse files
Files changed (4) hide show
  1. app.py +9 -3
  2. utils/chatbot.py +58 -41
  3. utils/utils.py +1 -0
  4. utils/work_flow_controller.py +12 -4
app.py CHANGED
@@ -34,7 +34,9 @@ with gr.Blocks() as demo:
34
  )
35
  upload_to_db = gr.CheckboxGroup(
36
  ["Upload to Database"],
37
- label="是否上傳至資料庫", info="將資料上傳至資料庫時,資料庫會自動建立索引,下次使用時可以直接檢索,預設為僅作這次使用", scale=1
 
 
38
  )
39
 
40
  with gr.Row():
@@ -85,7 +87,6 @@ with gr.Blocks() as demo:
85
  **bot_args
86
  ).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
87
 
88
-
89
  # defining workflow of clear state
90
  clear_state_args = dict(
91
  fn=clear_state,
@@ -122,7 +123,12 @@ with gr.Blocks() as demo:
122
  **change_md_args
123
  )
124
 
125
- video_text_input.submit(video_bot, [test_video_chabot, video_text_input], video_text_output, api_name="video_bot")
 
 
 
 
 
126
 
127
  if __name__ == "__main__":
128
  demo.launch()
 
34
  )
35
  upload_to_db = gr.CheckboxGroup(
36
  ["Upload to Database"],
37
+ label="是否上傳至資料庫",
38
+ info="將資料上傳至資料庫時,資料庫會自動建立索引,下次使用時可以直接檢索,預設為僅作這次使用",
39
+ scale=1,
40
  )
41
 
42
  with gr.Row():
 
87
  **bot_args
88
  ).then(lambda: gr.update(interactive=True), None, [user_input], queue=False)
89
 
 
90
  # defining workflow of clear state
91
  clear_state_args = dict(
92
  fn=clear_state,
 
123
  **change_md_args
124
  )
125
 
126
+ video_text_input.submit(
127
+ video_bot,
128
+ [test_video_chabot, video_text_input],
129
+ video_text_output,
130
+ api_name="video_bot",
131
+ )
132
 
133
  if __name__ == "__main__":
134
  demo.launch()
utils/chatbot.py CHANGED
@@ -19,6 +19,7 @@ from .work_flow_controller import WorkFlowController
19
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
  openai.api_key = OPENAI_API_KEY
21
 
 
22
  class Chatbot:
23
  def __init__(self) -> None:
24
  self.history = []
@@ -55,7 +56,7 @@ class Chatbot:
55
  continue
56
  self.knowledge_base = db
57
  self.upload_state = "done"
58
-
59
  def __get_local_knowledge_base(self):
60
  with open(self.csv_result_path, "r", encoding="UTF-8") as fp:
61
  knowledge_base = pd.read_csv(fp)
@@ -71,16 +72,22 @@ class Chatbot:
71
  # db.to_csv(f"{self.uid}_knowledge_base.csv", index=False)
72
  cur_content.to_csv(f"{self.uid}_knowledge_base.csv", index=False)
73
  media = MediaFileUpload(f"{self.uid}_knowledge_base.csv", resumable=True)
74
- request = service.files().update(fileId="1m3ozrphHP221hhdCFMFX9-10nzSDfNyW", media_body=media).execute()
75
-
 
 
 
 
76
  def __init_drive_service(self):
77
- SCOPES = ['https://www.googleapis.com/auth/drive']
78
  SERVICE_ACCOUNT_FILE = os.getenv("CREDENTIALS")
79
 
80
- creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
81
-
82
- return build('drive', 'v3', credentials=creds)
83
-
 
 
84
  def __read_db(self, service):
85
  request = service.files().get_media(fileId="1m3ozrphHP221hhdCFMFX9-10nzSDfNyW")
86
  fh = io.BytesIO()
@@ -95,13 +102,13 @@ class Chatbot:
95
  fh.seek(0)
96
 
97
  return pd.read_csv(fh)
98
-
99
  def __read_file(self, service, filename) -> pd.DataFrame:
100
  query = f"name='{filename}'"
101
  results = service.files().list(q=query).execute()
102
- files = results.get('files', [])
103
 
104
- file_id = files[0]['id']
105
 
106
  request = service.files().get_media(fileId=file_id)
107
  fh = io.BytesIO()
@@ -116,25 +123,33 @@ class Chatbot:
116
  fh.seek(0)
117
 
118
  return pd.read_csv(fh)
119
-
120
  def __upload_file(self, service):
121
  results = service.files().list(pageSize=10).execute()
122
- items = results.get('files', [])
123
  if not items:
124
- print('No files found.')
125
  else:
126
- print('Files:')
127
  for item in items:
128
  print(f"{item['name']} ({item['id']})")
129
 
130
  media = MediaFileUpload(self.csv_result_path, resumable=True)
131
- filename_prefix = 'ex_bot_database_'
132
- filename = filename_prefix + self.uid + '.csv'
133
- request = service.files().create(media_body=media, body={
134
- 'name': filename,
135
- 'parents': ["1Lp21EZlVlqL-c27VQBC6wTbUC1YpKMsG"] # Optional, to place the file in a specific folder
136
- }).execute()
137
-
 
 
 
 
 
 
 
 
138
 
139
  def clear_state(self):
140
  self.context = None
@@ -240,11 +255,11 @@ class VideoChatbot:
240
  def __init__(self) -> None:
241
  self.metadata_keys = ["標題", "逐字稿", "摘要", "關鍵字"]
242
  self.metadata = {
243
- "c2fK-hxnPSY":{
244
- "標題": "可汗學院的創新教學:學生與老師模式解析",
245
  "逐字稿": "0:00\n這裡是一個關於西班牙美洲戰爭和AP美國歷史的練習\n0:04\n在可汗學院,我們以學生模式開始,並注意到如果學生要求解釋\n0:11\n它不只是給出答案,它會像一個好的導師一樣,只是試圖引導\n0:15\n學生朝正確的方向前進,並且還注意到老師可以看到\n0:21\n學生正在互動的內容作為安全措施,現在如果我們關閉學生模式,我們\n0:27\n進入老師模式,我們看到當老師要求解釋時,它非常不同,就像\n0:32\n有了老師的指南,它會給出如你所見的非常詳細的解釋,如果老師\n0:39\n想要它的教案,他們只需要要求,他們就會得到一個非常詳細的\n0:44\n教案,包括目標、活動和家庭作業要做的事情,然後如果老師\n0:52\n說太好了,Khanmigo,你說��一個講義或者作為家庭作業給一個反思\n0:58\n實際上給了反思作業,然後它會再次為老師構建那個\n1:03\n如果老師喜歡,他們可以要求自定義這些教案或這些提示或者這些\n1:08\n反思,讓它們更符合他們的學生正在做的事情,這是老師們通常花費\n1:13\n每天好幾個小時工作的事情,我們希望能夠節省\n1:17\n他們很多時間和精力,以利他們自己的健康和他們的學生。",
246
  "摘要": "這段文字描述了一個關於西班牙美洲戰爭和AP美國歷史的教學練習。練習首先展示學生模式,強調良好的教導方式並提到教師可以監控學生互動情況作為安全措施。隨後,進入老師模式,提供了詳細的解釋和教案,包括目標、活動和家庭作業。另外,還有一個自定義教案的選項,使其更符合學生的需求。整個過程旨在節省教師的時間和精力,並有助於他們的健康和學生的學習。",
247
- "關鍵字": ["AP美國歷史", "學生模式", "老師模式", "教案設計", "自定義教學"]
248
  }
249
  }
250
 
@@ -261,16 +276,17 @@ class VideoChatbot:
261
  你是一個知識檢索系統,我會給你一份文件,請幫我依照文件內容回答問題,並用繁體中文回答。以下是文件內容
262
  """
263
  messages = [
264
- {"role": "system", "content": f"{system_prompt} + '\n' '''{context}'''"}, {"role": "user", "content": user_message}
265
- ]
 
266
  try:
267
  response = openai.ChatCompletion.create(
268
- model='gpt-3.5-turbo',
269
  messages=messages,
270
  temperature=1,
271
  max_tokens=2048,
272
  frequency_penalty=0,
273
- presence_penalty=.6,
274
  )
275
  bot_answer = response["choices"][0]["message"]["content"]
276
 
@@ -281,7 +297,7 @@ class VideoChatbot:
281
 
282
  def compute_similariy(self, user_message):
283
  threshold = 0.5
284
-
285
  user_message_embedding = openai.Embedding.create(
286
  input=user_message, engine="text-embedding-ada-002"
287
  )["data"][0]["embedding"]
@@ -290,26 +306,27 @@ class VideoChatbot:
290
 
291
  for index in self.metadata_keys:
292
  index_embedding[index] = openai.Embedding.create(
293
- input=self.metadata[self.video_id][index], engine="text-embedding-ada-002"
 
294
  )["data"][0]["embedding"]
295
 
296
  # turn index_embedding into a dataframe
297
- index_embedding = pd.DataFrame({
298
- 'title': [list(index_embedding.keys())[0]],
299
- 'embedding': [list(index_embedding.values())[0]]
300
- })
 
 
301
 
302
- index_embedding['distance'] = distances_from_embeddings(
303
  user_message_embedding,
304
- index_embedding['embedding'].values,
305
  distance_metric="cosine",
306
  )
307
 
308
- index_embedding = index_embedding.sort_values(
309
- by="distance", ascending=True
310
- )
311
 
312
  if index_embedding["distance"].values[0] > threshold:
313
  return None
314
  else:
315
- return index_embedding['title'][0]
 
19
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
  openai.api_key = OPENAI_API_KEY
21
 
22
+
23
  class Chatbot:
24
  def __init__(self) -> None:
25
  self.history = []
 
56
  continue
57
  self.knowledge_base = db
58
  self.upload_state = "done"
59
+
60
  def __get_local_knowledge_base(self):
61
  with open(self.csv_result_path, "r", encoding="UTF-8") as fp:
62
  knowledge_base = pd.read_csv(fp)
 
72
  # db.to_csv(f"{self.uid}_knowledge_base.csv", index=False)
73
  cur_content.to_csv(f"{self.uid}_knowledge_base.csv", index=False)
74
  media = MediaFileUpload(f"{self.uid}_knowledge_base.csv", resumable=True)
75
+ request = (
76
+ service.files()
77
+ .update(fileId="1m3ozrphHP221hhdCFMFX9-10nzSDfNyW", media_body=media)
78
+ .execute()
79
+ )
80
+
81
  def __init_drive_service(self):
82
+ SCOPES = ["https://www.googleapis.com/auth/drive"]
83
  SERVICE_ACCOUNT_FILE = os.getenv("CREDENTIALS")
84
 
85
+ creds = Credentials.from_service_account_file(
86
+ SERVICE_ACCOUNT_FILE, scopes=SCOPES
87
+ )
88
+
89
+ return build("drive", "v3", credentials=creds)
90
+
91
  def __read_db(self, service):
92
  request = service.files().get_media(fileId="1m3ozrphHP221hhdCFMFX9-10nzSDfNyW")
93
  fh = io.BytesIO()
 
102
  fh.seek(0)
103
 
104
  return pd.read_csv(fh)
105
+
106
  def __read_file(self, service, filename) -> pd.DataFrame:
107
  query = f"name='{filename}'"
108
  results = service.files().list(q=query).execute()
109
+ files = results.get("files", [])
110
 
111
+ file_id = files[0]["id"]
112
 
113
  request = service.files().get_media(fileId=file_id)
114
  fh = io.BytesIO()
 
123
  fh.seek(0)
124
 
125
  return pd.read_csv(fh)
126
+
127
  def __upload_file(self, service):
128
  results = service.files().list(pageSize=10).execute()
129
+ items = results.get("files", [])
130
  if not items:
131
+ print("No files found.")
132
  else:
133
+ print("Files:")
134
  for item in items:
135
  print(f"{item['name']} ({item['id']})")
136
 
137
  media = MediaFileUpload(self.csv_result_path, resumable=True)
138
+ filename_prefix = "ex_bot_database_"
139
+ filename = filename_prefix + self.uid + ".csv"
140
+ request = (
141
+ service.files()
142
+ .create(
143
+ media_body=media,
144
+ body={
145
+ "name": filename,
146
+ "parents": [
147
+ "1Lp21EZlVlqL-c27VQBC6wTbUC1YpKMsG"
148
+ ], # Optional, to place the file in a specific folder
149
+ },
150
+ )
151
+ .execute()
152
+ )
153
 
154
  def clear_state(self):
155
  self.context = None
 
255
  def __init__(self) -> None:
256
  self.metadata_keys = ["標題", "逐字稿", "摘要", "關鍵字"]
257
  self.metadata = {
258
+ "c2fK-hxnPSY": {
259
+ "標題": "可汗學院的創新教學:學生與老師模式解析",
260
  "逐字稿": "0:00\n這裡是一個關於西班牙美洲戰爭和AP美國歷史的練習\n0:04\n在可汗學院,我們以學生模式開始,並注意到如果學生要求解釋\n0:11\n它不只是給出答案,它會像一個好的導師一樣,只是試圖引導\n0:15\n學生朝正確的方向前進,並且還注意到老師可以看到\n0:21\n學生正在互動的內容作為安全措施,現在如果我們關閉學生模式,我們\n0:27\n進入老師模式,我們看到當老師要求解釋時,它非常不同,就像\n0:32\n有了老師的指南,它會給出如你所見的非常詳細的解釋,如果老師\n0:39\n想要它的教案,他們只需要要求,他們就會得到一個非常詳細的\n0:44\n教案,包括目標、活動和家庭作業要做的事情,然後如果老師\n0:52\n說太好了,Khanmigo,你說��一個講義或者作為家庭作業給一個反思\n0:58\n實際上給了反思作業,然後它會再次為老師構建那個\n1:03\n如果老師喜歡,他們可以要求自定義這些教案或這些提示或者這些\n1:08\n反思,讓它們更符合他們的學生正在做的事情,這是老師們通常花費\n1:13\n每天好幾個小時工作的事情,我們希望能夠節省\n1:17\n他們很多時間和精力,以利他們自己的健康和他們的學生。",
261
  "摘要": "這段文字描述了一個關於西班牙美洲戰爭和AP美國歷史的教學練習。練習首先展示學生模式,強調良好的教導方式並提到教師可以監控學生互動情況作為安全措施。隨後,進入老師模式,提供了詳細的解釋和教案,包括目標、活動和家庭作業。另外,還有一個自定義教案的選項,使其更符合學生的需求。整個過程旨在節省教師的時間和精力,並有助於他們的健康和學生的學習。",
262
+ "關鍵字": ["AP美國歷史", "學生模式", "老師模式", "教案設計", "自定義教學"],
263
  }
264
  }
265
 
 
276
  你是一個知識檢索系統,我會給你一份文件,請幫我依照文件內容回答問題,並用繁體中文回答。以下是文件內容
277
  """
278
  messages = [
279
+ {"role": "system", "content": f"{system_prompt} + '\n' '''{context}'''"},
280
+ {"role": "user", "content": user_message},
281
+ ]
282
  try:
283
  response = openai.ChatCompletion.create(
284
+ model="gpt-3.5-turbo",
285
  messages=messages,
286
  temperature=1,
287
  max_tokens=2048,
288
  frequency_penalty=0,
289
+ presence_penalty=0.6,
290
  )
291
  bot_answer = response["choices"][0]["message"]["content"]
292
 
 
297
 
298
  def compute_similariy(self, user_message):
299
  threshold = 0.5
300
+
301
  user_message_embedding = openai.Embedding.create(
302
  input=user_message, engine="text-embedding-ada-002"
303
  )["data"][0]["embedding"]
 
306
 
307
  for index in self.metadata_keys:
308
  index_embedding[index] = openai.Embedding.create(
309
+ input=self.metadata[self.video_id][index],
310
+ engine="text-embedding-ada-002",
311
  )["data"][0]["embedding"]
312
 
313
  # turn index_embedding into a dataframe
314
+ index_embedding = pd.DataFrame(
315
+ {
316
+ "title": [list(index_embedding.keys())[0]],
317
+ "embedding": [list(index_embedding.values())[0]],
318
+ }
319
+ )
320
 
321
+ index_embedding["distance"] = distances_from_embeddings(
322
  user_message_embedding,
323
+ index_embedding["embedding"].values,
324
  distance_metric="cosine",
325
  )
326
 
327
+ index_embedding = index_embedding.sort_values(by="distance", ascending=True)
 
 
328
 
329
  if index_embedding["distance"].values[0] > threshold:
330
  return None
331
  else:
332
+ return index_embedding["title"][0]
utils/utils.py CHANGED
@@ -25,5 +25,6 @@ def user(chatbot, *args):
25
  def bot(chatbot, *args):
26
  return chatbot.bot(*args)
27
 
 
28
  def video_bot(video_chatbot, *args):
29
  return video_chatbot.answer_question(*args)
 
25
  def bot(chatbot, *args):
26
  return chatbot.bot(*args)
27
 
28
+
29
  def video_bot(video_chatbot, *args):
30
  return video_chatbot.answer_question(*args)
utils/work_flow_controller.py CHANGED
@@ -109,13 +109,17 @@ class WorkFlowController:
109
 
110
  def __dump_to_json(self):
111
  with open(
112
- os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.json"), "w", encoding="utf-8"
 
 
113
  ) as f:
114
  print(
115
  "Dumping to json, the path is: "
116
  + os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.json")
117
  )
118
- self.json_result_path = os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.json")
 
 
119
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
120
 
121
  def __construct_knowledge_base_dataframe(self):
@@ -141,12 +145,16 @@ class WorkFlowController:
141
 
142
  def __dump_to_csv(self):
143
  df = self.__construct_knowledge_base_dataframe()
144
- df.to_csv(os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.csv"), index=False)
 
 
145
  print(
146
  "Dumping to csv, the path is: "
147
  + os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.csv")
148
  )
149
- self.csv_result_path = os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.csv")
 
 
150
 
151
  def __get_file_name(self, file_src):
152
  file_paths = [x.name for x in file_src]
 
109
 
110
  def __dump_to_json(self):
111
  with open(
112
+ os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.json"),
113
+ "w",
114
+ encoding="utf-8",
115
  ) as f:
116
  print(
117
  "Dumping to json, the path is: "
118
  + os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.json")
119
  )
120
+ self.json_result_path = os.path.join(
121
+ os.getcwd(), f"{self.uid}_knowledge_base.json"
122
+ )
123
  json.dump(self.files_info, f, indent=4, ensure_ascii=False)
124
 
125
  def __construct_knowledge_base_dataframe(self):
 
145
 
146
  def __dump_to_csv(self):
147
  df = self.__construct_knowledge_base_dataframe()
148
+ df.to_csv(
149
+ os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.csv"), index=False
150
+ )
151
  print(
152
  "Dumping to csv, the path is: "
153
  + os.path.join(os.getcwd(), f"{self.uid}_knowledge_base.csv")
154
  )
155
+ self.csv_result_path = os.path.join(
156
+ os.getcwd(), f"{self.uid}_knowledge_base.csv"
157
+ )
158
 
159
  def __get_file_name(self, file_src):
160
  file_paths = [x.name for x in file_src]